diff options
211 files changed, 2298 insertions, 1632 deletions
diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index 2216eb187c21..b784c270105f 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -23,6 +23,10 @@ Example: Reminder: sizeof() result is of type size_t. +The kernel's printf does not support %n. For obvious reasons, floating +point formats (%e, %f, %g, %a) are also not recognized. Use of any +unsupported specifier or length qualifier results in a WARN and early +return from vsnprintf. Raw pointer value SHOULD be printed with %p. The kernel supports the following extended format specifiers for pointer types: @@ -119,6 +123,7 @@ Raw buffer as an escaped string: If field width is omitted the 1 byte only will be escaped. Raw buffer as a hex string: + %*ph 00 01 02 ... 3f %*phC 00:01:02: ... :3f %*phD 00-01-02- ... -3f @@ -234,6 +239,7 @@ UUID/GUID addresses: Passed by reference. dentry names: + %pd{,2,3,4} %pD{,2,3,4} @@ -256,6 +262,8 @@ struct va_format: va_list *va; }; + Implements a "recursive vsnprintf". + Do not use this feature without some mechanism to verify the correctness of the format string and va_list arguments. @@ -284,6 +292,27 @@ bitmap and its derivatives such as cpumask and nodemask: Passed by reference. +Network device features: + + %pNF 0x000000000000c000 + + For printing netdev_features_t. + + Passed by reference. + +Command from struct task_struct + + %pT ls + + For printing executable name excluding path from struct + task_struct. + + Passed by reference. + +If you add other %p extensions, please extend lib/test_printf.c with +one or more test cases, if at all feasible. + + Thank you for your cooperation and attention. 
diff --git a/Documentation/vm/balance b/Documentation/vm/balance index c46e68cf9344..964595481af6 100644 --- a/Documentation/vm/balance +++ b/Documentation/vm/balance @@ -1,12 +1,14 @@ Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com> -Memory balancing is needed for non __GFP_WAIT as well as for non -__GFP_IO allocations. +Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +well as for non __GFP_IO allocations. -There are two reasons to be requesting non __GFP_WAIT allocations: -the caller can not sleep (typically intr context), or does not want -to incur cost overheads of page stealing and possible swap io for -whatever reasons. +The first reason why a caller may avoid reclaim is that the caller can not +sleep due to holding a spinlock or is in interrupt context. The second may +be that the caller is willing to fail the allocation without incurring the +overhead of page reclaim. This may happen for opportunistic high-order +allocation requests that have order-0 fallback options. In such cases, +the caller may also wish to avoid waking kswapd. __GFP_IO allocation requests are made to prevent file system deadlocks. diff --git a/Documentation/vm/split_page_table_lock b/Documentation/vm/split_page_table_lock index 6dea4fd5c961..62842a857dab 100644 --- a/Documentation/vm/split_page_table_lock +++ b/Documentation/vm/split_page_table_lock @@ -54,8 +54,8 @@ everything required is done by pgtable_page_ctor() and pgtable_page_dtor(), which must be called on PTE table allocation / freeing. Make sure the architecture doesn't use slab allocator for page table -allocation: slab uses page->slab_cache and page->first_page for its pages. -These fields share storage with page->ptl. +allocation: slab uses page->slab_cache for its pages. +This field shares storage with page->ptl. PMD split lock only makes sense if you have more than two page table levels. 
diff --git a/MAINTAINERS b/MAINTAINERS index 4c5446a6a4a2..7af7f4a01f0b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4209,7 +4209,10 @@ L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/extcon.git S: Maintained F: drivers/extcon/ +F: include/linux/extcon/ +F: include/linux/extcon.h F: Documentation/extcon/ +F: Documentation/devicetree/bindings/extcon/ EXYNOS DP DRIVER M: Jingoo Han <jingoohan1@gmail.com> @@ -7490,6 +7493,7 @@ S: Supported F: Documentation/filesystems/nilfs2.txt F: fs/nilfs2/ F: include/linux/nilfs2_fs.h +F: include/trace/events/nilfs2.h NINJA SCSI-3 / NINJA SCSI-32Bi (16bit/CardBus) PCMCIA SCSI HOST ADAPTER DRIVER M: YOKOTA Hiroshi <yokota@netlab.is.tsukuba.ac.jp> diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index ad4eb2d26e16..e62400e5fb99 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -651,12 +651,12 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, if (nommu()) addr = __alloc_simple_buffer(dev, size, gfp, &page); - else if (dev_get_cma_area(dev) && (gfp & __GFP_WAIT)) + else if (dev_get_cma_area(dev) && (gfp & __GFP_DIRECT_RECLAIM)) addr = __alloc_from_contiguous(dev, size, prot, &page, caller, want_vaddr); else if (is_coherent) addr = __alloc_simple_buffer(dev, size, gfp, &page); - else if (!(gfp & __GFP_WAIT)) + else if (!gfpflags_allow_blocking(gfp)) addr = __alloc_from_pool(size, &page); else addr = __alloc_remap_buffer(dev, size, gfp, prot, &page, @@ -1363,7 +1363,7 @@ static void *arm_iommu_alloc_attrs(struct device *dev, size_t size, *handle = DMA_ERROR_CODE; size = PAGE_ALIGN(size); - if (!(gfp & __GFP_WAIT)) + if (!gfpflags_allow_blocking(gfp)) return __iommu_alloc_atomic(dev, size, handle); /* diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index 7c34f7126b04..c5f9a9e3d1f3 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -25,7 +25,7 @@ unsigned long xen_get_swiotlb_free_pages(unsigned int order) { 
struct memblock_region *reg; - gfp_t flags = __GFP_NOWARN; + gfp_t flags = __GFP_NOWARN|__GFP_KSWAPD_RECLAIM; for_each_memblock(memory, reg) { if (reg->base < (phys_addr_t)0xffffffff) { diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 6320361d8d4c..bb4bf6a06ad6 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -100,7 +100,7 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size, if (IS_ENABLED(CONFIG_ZONE_DMA) && dev->coherent_dma_mask <= DMA_BIT_MASK(32)) flags |= GFP_DMA; - if (dev_get_cma_area(dev) && (flags & __GFP_WAIT)) { + if (dev_get_cma_area(dev) && gfpflags_allow_blocking(flags)) { struct page *page; void *addr; @@ -148,7 +148,7 @@ static void *__dma_alloc(struct device *dev, size_t size, size = PAGE_ALIGN(size); - if (!coherent && !(flags & __GFP_WAIT)) { + if (!coherent && !gfpflags_allow_blocking(flags)) { struct page *page = NULL; void *addr = __alloc_from_pool(size, &page, flags); diff --git a/arch/sh/kernel/cpu/sh5/unwind.c b/arch/sh/kernel/cpu/sh5/unwind.c index 10aed41757fc..3a4fed406fc6 100644 --- a/arch/sh/kernel/cpu/sh5/unwind.c +++ b/arch/sh/kernel/cpu/sh5/unwind.c @@ -159,7 +159,7 @@ static int lookup_prev_stack_frame(unsigned long fp, unsigned long pc, /* Sign extend */ regcache[dest] = - ((((s64)(u64)op >> 10) & 0xffff) << 54) >> 54; + sign_extend64((((u64)op >> 10) & 0xffff), 9); break; case (0xd0 >> 2): /* addi */ case (0xd4 >> 2): /* addi.l */ diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c index 112ea11c030d..d208c27ccc67 100644 --- a/arch/sh/kernel/traps_64.c +++ b/arch/sh/kernel/traps_64.c @@ -101,7 +101,7 @@ static int generate_and_check_address(struct pt_regs *regs, if (displacement_not_indexed) { __s64 displacement; displacement = (opcode >> 10) & 0x3ff; - displacement = ((displacement << 54) >> 54); /* sign extend */ + displacement = sign_extend64(displacement, 9); addr = (__u64)((__s64)base_address + (displacement << width_shift)); } else { 
__u64 offset; diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c index f32ac13934f2..ec863b9a9f78 100644 --- a/arch/x86/kernel/cpu/perf_event_msr.c +++ b/arch/x86/kernel/cpu/perf_event_msr.c @@ -163,10 +163,9 @@ again: goto again; delta = now - prev; - if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) { - delta <<= 32; - delta >>= 32; /* sign extend */ - } + if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) + delta = sign_extend64(delta, 31); + local64_add(now - prev, &event->count); } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index cd99433b8ba1..6ba014c61d62 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -90,7 +90,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, again: page = NULL; /* CMA can be used only in the context which permits sleeping */ - if (flag & __GFP_WAIT) { + if (gfpflags_allow_blocking(flag)) { page = dma_alloc_from_contiguous(dev, count, get_order(size)); if (page && page_to_phys(page) + size > dma_mask) { dma_release_from_contiguous(dev, page, count); diff --git a/arch/xtensa/configs/iss_defconfig b/arch/xtensa/configs/iss_defconfig index f3dfe0d921c2..44c6764d9146 100644 --- a/arch/xtensa/configs/iss_defconfig +++ b/arch/xtensa/configs/iss_defconfig @@ -169,7 +169,6 @@ CONFIG_FLATMEM_MANUAL=y # CONFIG_SPARSEMEM_MANUAL is not set CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y -CONFIG_PAGEFLAGS_EXTENDED=y CONFIG_SPLIT_PTLOCK_CPUS=4 # CONFIG_PHYS_ADDR_T_64BIT is not set CONFIG_ZONE_DMA_FLAG=1 diff --git a/block/bio.c b/block/bio.c index ad3f276d74bc..4f184d938942 100644 --- a/block/bio.c +++ b/block/bio.c @@ -211,7 +211,7 @@ fallback: bvl = mempool_alloc(pool, gfp_mask); } else { struct biovec_slab *bvs = bvec_slabs + *idx; - gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); /* * Make this allocation restricted and don't dump info on @@ -221,11 +221,11 @@ 
fallback: __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; /* - * Try a slab allocation. If this fails and __GFP_WAIT + * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM * is set, retry with the 1-entry mempool */ bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); - if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) { + if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { *idx = BIOVEC_MAX_IDX; goto fallback; } @@ -395,12 +395,12 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is * backed by the @bs's mempool. * - * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be - * able to allocate a bio. This is due to the mempool guarantees. To make this - * work, callers must never allocate more than 1 bio at a time from this pool. - * Callers that need to allocate more than 1 bio must always submit the - * previously allocated bio for IO before attempting to allocate a new one. - * Failure to do so can cause deadlocks under memory pressure. + * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will + * always be able to allocate a bio. This is due to the mempool guarantees. + * To make this work, callers must never allocate more than 1 bio at a time + * from this pool. Callers that need to allocate more than 1 bio must always + * submit the previously allocated bio for IO before attempting to allocate + * a new one. Failure to do so can cause deadlocks under memory pressure. * * Note that when running under generic_make_request() (i.e. any block * driver), bios are not submitted until after you return - see the code in @@ -459,13 +459,13 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) * We solve this, and guarantee forward progress, with a rescuer * workqueue per bio_set. 
If we go to allocate and there are * bios on current->bio_list, we first try the allocation - * without __GFP_WAIT; if that fails, we punt those bios we - * would be blocking to the rescuer workqueue before we retry - * with the original gfp_flags. + * without __GFP_DIRECT_RECLAIM; if that fails, we punt those + * bios we would be blocking to the rescuer workqueue before + * we retry with the original gfp_flags. */ if (current->bio_list && !bio_list_empty(current->bio_list)) - gfp_mask &= ~__GFP_WAIT; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(bs->bio_pool, gfp_mask); if (!p && gfp_mask != saved_gfp) { diff --git a/block/blk-core.c b/block/blk-core.c index 89eec7965870..590cca21c24a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -638,7 +638,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp) if (percpu_ref_tryget_live(&q->q_usage_counter)) return 0; - if (!(gfp & __GFP_WAIT)) + if (!gfpflags_allow_blocking(gfp)) return -EBUSY; ret = wait_event_interruptible(q->mq_freeze_wq, @@ -1206,8 +1206,8 @@ rq_starved: * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * - * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this - * function keeps retrying under memory pressure and fails iff @q is dead. + * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, + * this function keeps retrying under memory pressure and fails iff @q is dead. * * Must be called with @q->queue_lock held and, * Returns ERR_PTR on failure, with @q->queue_lock held. @@ -1227,7 +1227,7 @@ retry: if (!IS_ERR(rq)) return rq; - if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) { + if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; } @@ -1305,11 +1305,11 @@ EXPORT_SYMBOL(blk_get_request); * BUG. * * WARNING: When allocating/cloning a bio-chain, careful consideration should be - * given to how you allocate bios. 
In particular, you cannot use __GFP_WAIT for - * anything but the first bio in the chain. Otherwise you risk waiting for IO - * completion of a bio that hasn't been submitted yet, thus resulting in a - * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead - * of bio_alloc(), as that avoids the mempool deadlock. + * given to how you allocate bios. In particular, you cannot use + * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise + * you risk waiting for IO completion of a bio that hasn't been submitted yet, + * thus resulting in a deadlock. Alternatively bios should be allocated using + * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock. * If possible a big IO should be split into smaller parts when allocation * fails. Partial allocation should not be an error, or you risk a live-lock. */ @@ -2038,7 +2038,7 @@ void generic_make_request(struct bio *bio) do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - if (likely(blk_queue_enter(q, __GFP_WAIT) == 0)) { + if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) { q->make_request_fn(q, bio); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 1a27f45ec776..381cb50a673c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -289,7 +289,7 @@ struct io_context *get_task_io_context(struct task_struct *task, { struct io_context *ioc; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_flags)); do { task_lock(task); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 60ac684c8b8c..a07ca3488d96 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -268,7 +268,7 @@ static int bt_get(struct blk_mq_alloc_data *data, if (tag != -1) return tag; - if (!(data->gfp & __GFP_WAIT)) + if (!gfpflags_allow_blocking(data->gfp)) return -1; bs = bt_wait_ptr(bt, hctx); diff --git a/block/blk-mq.c b/block/blk-mq.c index 1c27b3eaef64..694f8703f83c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -244,11 
+244,11 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT, + blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM, reserved, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); - if (!rq && (gfp & __GFP_WAIT)) { + if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) { __blk_mq_run_hw_queue(hctx); blk_mq_put_ctx(ctx); @@ -1186,7 +1186,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, - __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx); + __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; hctx = alloc_data.hctx; diff --git a/block/ioprio.c b/block/ioprio.c index 31666c92b46a..cc7800e9eb44 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -123,7 +123,8 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) break; do_each_thread(g, p) { - if (!uid_eq(task_uid(p), uid)) + if (!uid_eq(task_uid(p), uid) || + !task_pid_vnr(p)) continue; ret = set_task_ioprio(p, ioprio); if (ret) @@ -220,7 +221,8 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) break; do_each_thread(g, p) { - if (!uid_eq(task_uid(p), user->uid)) + if (!uid_eq(task_uid(p), user->uid) || + !task_pid_vnr(p)) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index dda653ce7b24..0774799942e0 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -444,7 +444,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, } - rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT); + rq = blk_get_request(q, in_len ? 
WRITE : READ, __GFP_RECLAIM); if (IS_ERR(rq)) { err = PTR_ERR(rq); goto error_free_buffer; @@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, break; } - if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) { + if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_RECLAIM)) { err = DRIVER_ERROR << 24; goto error; } @@ -536,7 +536,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, struct request *rq; int err; - rq = blk_get_request(q, WRITE, __GFP_WAIT); + rq = blk_get_request(q, WRITE, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); blk_rq_set_block_pc(rq); diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index d3d73d114a46..9462d2752850 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1007,7 +1007,7 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho bm_set_page_unchanged(b->bm_pages[page_nr]); if (ctx->flags & BM_AIO_COPY_PAGES) { - page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); + page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_RECLAIM); copy_highpage(page, b->bm_pages[page_nr]); bm_store_page_idx(page, page_nr); } else diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c097909c589c..b4b5680ac6ad 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -357,7 +357,8 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto } if (has_payload && data_size) { - page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT)); + page = drbd_alloc_pages(peer_device, nr_pages, + gfpflags_allow_blocking(gfp_mask)); if (!page) goto fail; } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index f504232c1ee7..a28a562f7b7f 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ 
b/drivers/block/mtip32xx/mtip32xx.c @@ -173,7 +173,7 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) { struct request *rq; - rq = blk_mq_alloc_request(dd->queue, 0, __GFP_WAIT, true); + rq = blk_mq_alloc_request(dd->queue, 0, __GFP_RECLAIM, true); return blk_mq_rq_to_pdu(rq); } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 1b87623381e2..93b3f99b6865 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -444,9 +444,7 @@ static int nbd_thread_recv(struct nbd_device *nbd) spin_unlock_irqrestore(&nbd->tasks_lock, flags); if (signal_pending(current)) { - siginfo_t info; - - ret = dequeue_signal_lock(current, &current->blocked, &info); + ret = kernel_dequeue_signal(NULL); dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n", task_pid_nr(current), current->comm, ret); mutex_lock(&nbd->tx_lock); @@ -560,11 +558,8 @@ static int nbd_thread_send(void *data) !list_empty(&nbd->waiting_queue)); if (signal_pending(current)) { - siginfo_t info; - int ret; + int ret = kernel_dequeue_signal(NULL); - ret = dequeue_signal_lock(current, &current->blocked, - &info); dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n", task_pid_nr(current), current->comm, ret); mutex_lock(&nbd->tx_lock); @@ -592,10 +587,8 @@ static int nbd_thread_send(void *data) spin_unlock_irqrestore(&nbd->tasks_lock, flags); /* Clear maybe pending signals */ - if (signal_pending(current)) { - siginfo_t info; - dequeue_signal_lock(current, &current->blocked, &info); - } + if (signal_pending(current)) + kernel_dequeue_signal(NULL); return 0; } diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index e22942596207..1b709a4e3b5e 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -271,7 +271,7 @@ static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask) goto err_out; tmp->bi_bdev = NULL; - gfpmask &= ~__GFP_WAIT; + gfpmask &= ~__GFP_DIRECT_RECLAIM; tmp->bi_next = NULL; if (!new_chain) diff --git a/drivers/block/paride/pd.c 
b/drivers/block/paride/pd.c index b9242d78283d..562b5a4ca7b7 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -723,7 +723,7 @@ static int pd_special_command(struct pd_unit *disk, struct request *rq; int err = 0; - rq = blk_get_request(disk->gd->queue, READ, __GFP_WAIT); + rq = blk_get_request(disk->gd->queue, READ, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index cd813f9110bf..2f477d45d6cf 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -704,14 +704,14 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * int ret = 0; rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? - WRITE : READ, __GFP_WAIT); + WRITE : READ, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); blk_rq_set_block_pc(rq); if (cgc->buflen) { ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, - __GFP_WAIT); + __GFP_RECLAIM); if (ret) goto out; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 9fa15bb9d118..81a557c33a1f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -106,7 +106,7 @@ static void zram_set_obj_size(struct zram_meta *meta, meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; } -static inline int is_partial_io(struct bio_vec *bvec) +static inline bool is_partial_io(struct bio_vec *bvec) { return bvec->bv_len != PAGE_SIZE; } @@ -114,25 +114,25 @@ static inline int is_partial_io(struct bio_vec *bvec) /* * Check if request is within bounds and aligned on zram logical blocks. 
*/ -static inline int valid_io_request(struct zram *zram, +static inline bool valid_io_request(struct zram *zram, sector_t start, unsigned int size) { u64 end, bound; /* unaligned request */ if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) - return 0; + return false; if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) - return 0; + return false; end = start + (size >> SECTOR_SHIFT); bound = zram->disksize >> SECTOR_SHIFT; /* out of range range */ if (unlikely(start >= bound || end > bound || start > end)) - return 0; + return false; /* I/O request is valid */ - return 1; + return true; } static void update_position(u32 *index, int *offset, struct bio_vec *bvec) @@ -157,7 +157,7 @@ static inline void update_used_max(struct zram *zram, } while (old_max != cur_max); } -static int page_zero_filled(void *ptr) +static bool page_zero_filled(void *ptr) { unsigned int pos; unsigned long *page; @@ -166,10 +166,10 @@ static int page_zero_filled(void *ptr) for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { if (page[pos]) - return 0; + return false; } - return 1; + return true; } static void handle_zero_page(struct bio_vec *bvec) @@ -365,6 +365,9 @@ static ssize_t comp_algorithm_store(struct device *dev, struct zram *zram = dev_to_zram(dev); size_t sz; + if (!zcomp_available_algorithm(buf)) + return -EINVAL; + down_write(&zram->init_lock); if (init_done(zram)) { up_write(&zram->init_lock); @@ -378,9 +381,6 @@ static ssize_t comp_algorithm_store(struct device *dev, if (sz > 0 && zram->compressor[sz - 1] == '\n') zram->compressor[sz - 1] = 0x00; - if (!zcomp_available_algorithm(zram->compressor)) - len = -EINVAL; - up_write(&zram->init_lock); return len; } @@ -726,14 +726,14 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } alloced_pages = zs_get_total_pages(meta->mem_pool); + update_used_max(zram, alloced_pages); + if (zram->limit_pages && alloced_pages > zram->limit_pages) { zs_free(meta->mem_pool, handle); ret = -ENOMEM; goto 
out; } - update_used_max(zram, alloced_pages); - cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index 30f522848c73..d7373ca69c99 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -124,7 +124,8 @@ int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 __group, if (group) return netlink_broadcast(dev->nls, skb, portid, group, gfp_mask); - return netlink_unicast(dev->nls, skb, portid, !(gfp_mask&__GFP_WAIT)); + return netlink_unicast(dev->nls, skb, portid, + !gfpflags_allow_blocking(gfp_mask)); } EXPORT_SYMBOL_GPL(cn_netlink_send_mult); diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 2a3973a7c441..36a7c2d89a01 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -486,7 +486,7 @@ static int ioctl_get_info(struct client *client, union ioctl_arg *arg) static int add_client_resource(struct client *client, struct client_resource *resource, gfp_t gfp_mask) { - bool preload = !!(gfp_mask & __GFP_WAIT); + bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; int ret; diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index 3c2d4abd71c5..1d47d2e9487c 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -491,7 +491,7 @@ struct page **drm_gem_get_pages(struct drm_gem_object *obj) * __GFP_DMA32 to be set in mapping_gfp_mask(inode->i_mapping) * so shmem can relocate pages during swapin if required. 
*/ - BUG_ON((mapping_gfp_mask(mapping) & __GFP_DMA32) && + BUG_ON(mapping_gfp_constraint(mapping, __GFP_DMA32) && (page_to_pfn(p) >= 0x00100000UL)); } diff --git a/drivers/gpu/drm/drm_lock.c b/drivers/gpu/drm/drm_lock.c index 4924d381b664..daa2ff12101b 100644 --- a/drivers/gpu/drm/drm_lock.c +++ b/drivers/gpu/drm/drm_lock.c @@ -38,8 +38,6 @@ #include "drm_legacy.h" #include "drm_internal.h" -static int drm_notifier(void *priv); - static int drm_lock_take(struct drm_lock_data *lock_data, unsigned int context); /** @@ -118,14 +116,8 @@ int drm_legacy_lock(struct drm_device *dev, void *data, * really probably not the correct answer but lets us debug xkb * xserver for now */ if (!file_priv->is_master) { - sigemptyset(&dev->sigmask); - sigaddset(&dev->sigmask, SIGSTOP); - sigaddset(&dev->sigmask, SIGTSTP); - sigaddset(&dev->sigmask, SIGTTIN); - sigaddset(&dev->sigmask, SIGTTOU); dev->sigdata.context = lock->context; dev->sigdata.lock = master->lock.hw_lock; - block_all_signals(drm_notifier, dev, &dev->sigmask); } if (dev->driver->dma_quiescent && (lock->flags & _DRM_LOCK_QUIESCENT)) @@ -169,7 +161,6 @@ int drm_legacy_unlock(struct drm_device *dev, void *data, struct drm_file *file_ /* FIXME: Should really bail out here. */ } - unblock_all_signals(); return 0; } @@ -288,38 +279,6 @@ int drm_legacy_lock_free(struct drm_lock_data *lock_data, unsigned int context) } /** - * If we get here, it means that the process has called DRM_IOCTL_LOCK - * without calling DRM_IOCTL_UNLOCK. - * - * If the lock is not held, then let the signal proceed as usual. If the lock - * is held, then set the contended flag and keep the signal blocked. - * - * \param priv pointer to a drm_device structure. - * \return one if the signal should be delivered normally, or zero if the - * signal should be blocked. 
- */ -static int drm_notifier(void *priv) -{ - struct drm_device *dev = priv; - struct drm_hw_lock *lock = dev->sigdata.lock; - unsigned int old, new, prev; - - /* Allow signal delivery if lock isn't held */ - if (!lock || !_DRM_LOCK_IS_HELD(lock->lock) - || _DRM_LOCKING_CONTEXT(lock->lock) != dev->sigdata.context) - return 1; - - /* Otherwise, set flag to force call to - drmUnlock */ - do { - old = lock->lock; - new = old | _DRM_LOCK_CONT; - prev = cmpxchg(&lock->lock, old, new); - } while (prev != old); - return 0; -} - -/** * This function returns immediately and takes the hw lock * with the kernel context if it is free, otherwise it gets the highest priority when and if * it is eventually released. diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 4d631a946481..399aab265db3 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2214,9 +2214,8 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj) * Fail silently without starting the shrinker */ mapping = file_inode(obj->base.filp)->i_mapping; - gfp = mapping_gfp_mask(mapping); - gfp |= __GFP_NORETRY | __GFP_NOWARN | __GFP_NO_KSWAPD; - gfp &= ~(__GFP_IO | __GFP_WAIT); + gfp = mapping_gfp_constraint(mapping, ~(__GFP_IO | __GFP_RECLAIM)); + gfp |= __GFP_NORETRY | __GFP_NOWARN; sg = st->sgl; st->nents = 0; for (i = 0; i < page_count; i++) { diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 1362ad80a76c..05352f490d60 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -92,7 +92,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, struct request *rq; int error; - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->special = (char *)pc; diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 64a6b827b3dd..ef907fd5ba98 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ 
-441,7 +441,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, struct request *rq; int error; - rq = blk_get_request(drive->queue, write, __GFP_WAIT); + rq = blk_get_request(drive->queue, write, __GFP_RECLAIM); memcpy(rq->cmd, cmd, BLK_MAX_CDB); rq->cmd_type = REQ_TYPE_ATA_PC; diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index 066e39036518..474173eb31bb 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c @@ -303,7 +303,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) struct request *rq; int ret; - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd_flags = REQ_QUIET; ret = blk_execute_rq(drive->queue, cd->disk, rq, 0); diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index b05a74d78ef5..0dd43b4fcec6 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -165,7 +165,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, if (!(setting->flags & DS_SYNC)) return setting->set(drive, arg); - rq = blk_get_request(q, READ, __GFP_WAIT); + rq = blk_get_request(q, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd_len = 5; rq->cmd[0] = REQ_DEVSET_EXEC; diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 56b9708894a5..37a8a907febe 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -477,7 +477,7 @@ static int set_multcount(ide_drive_t *drive, int arg) if (drive->special_flags & IDE_SFLAG_SET_MULTMODE) return -EBUSY; - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_ATA_TASKFILE; drive->mult_req = arg; diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index aa2e9b77b20d..d05db2469209 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -125,7 +125,7 @@ static int 
ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg) if (NULL == (void *) arg) { struct request *rq; - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_ATA_TASKFILE; err = blk_execute_rq(drive->queue, NULL, rq, 0); blk_put_request(rq); @@ -221,7 +221,7 @@ static int generic_drive_reset(ide_drive_t *drive) struct request *rq; int ret = 0; - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd_len = 1; rq->cmd[0] = REQ_DRIVE_RESET; diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index c80868520488..2d7dca56dd24 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -31,7 +31,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) } spin_unlock_irq(&hwif->lock); - rq = blk_get_request(q, READ, __GFP_WAIT); + rq = blk_get_request(q, READ, __GFP_RECLAIM); rq->cmd[0] = REQ_PARK_HEADS; rq->cmd_len = 1; rq->cmd_type = REQ_TYPE_DRV_PRIV; diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 081e43458d50..e34af488693a 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -18,7 +18,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) } memset(&rqpm, 0, sizeof(rqpm)); - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_ATA_PM_SUSPEND; rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_SUSPEND; @@ -88,7 +88,7 @@ int generic_ide_resume(struct device *dev) } memset(&rqpm, 0, sizeof(rqpm)); - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_ATA_PM_RESUME; rq->cmd_flags |= REQ_PREEMPT; rq->special = &rqpm; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index f5d51d1d09ee..12fa04997dcc 100644 --- a/drivers/ide/ide-tape.c +++ 
b/drivers/ide/ide-tape.c @@ -852,7 +852,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE); BUG_ON(size < 0 || size % tape->blk_size); - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd[13] = cmd; rq->rq_disk = tape->disk; @@ -860,7 +860,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) if (size) { ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size, - __GFP_WAIT); + __GFP_RECLAIM); if (ret) goto out_put; } diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 0979e126fff1..a716693417a3 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -430,7 +430,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, int error; int rw = !(cmd->tf_flags & IDE_TFLAG_WRITE) ? READ : WRITE; - rq = blk_get_request(drive->queue, rw, __GFP_WAIT); + rq = blk_get_request(drive->queue, rw, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_ATA_TASKFILE; /* @@ -441,7 +441,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, */ if (nsect) { error = blk_rq_map_kern(drive->queue, rq, buf, - nsect * SECTOR_SIZE, __GFP_WAIT); + nsect * SECTOR_SIZE, __GFP_RECLAIM); if (error) goto put_req; } diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index dcdaa79e3f0f..2aba774f835b 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1086,7 +1086,7 @@ static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent) static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) { - bool preload = !!(gfp_mask & __GFP_WAIT); + bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; int ret, id; diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 
7e00470adc30..4ff340fe904f 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -1680,7 +1680,7 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd) * heavy filesystem activity makes these fail, and we can * use compound pages. */ - gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; + gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP; egrcnt = rcd->rcvegrcnt; egroff = rcd->rcvegr_tid_base; diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 0d533bba4ad1..8b2be1e7714f 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2668,7 +2668,7 @@ static void *alloc_coherent(struct device *dev, size_t size, page = alloc_pages(flag | __GFP_NOWARN, get_order(size)); if (!page) { - if (!(flag & __GFP_WAIT)) + if (!gfpflags_allow_blocking(flag)) return NULL; page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 7cf80c1a8a16..f1042daef9ad 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -3647,7 +3647,7 @@ static void *intel_alloc_coherent(struct device *dev, size_t size, flags |= GFP_DMA32; } - if (flags & __GFP_WAIT) { + if (gfpflags_allow_blocking(flags)) { unsigned int count = size >> PAGE_SHIFT; page = dma_alloc_from_contiguous(dev, count, order); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3729b394432c..917d47e290ae 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -994,7 +994,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size) struct bio_vec *bvec; retry: - if (unlikely(gfp_mask & __GFP_WAIT)) + if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) mutex_lock(&cc->bio_alloc_lock); clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); @@ -1010,7 +1010,7 @@ retry: if (!page) { crypt_free_buffer_pages(cc, clone); bio_put(clone); - gfp_mask |= __GFP_WAIT; + gfp_mask |= __GFP_DIRECT_RECLAIM; goto retry; } @@ -1027,7 +1027,7 @@ 
retry: } return_clone: - if (unlikely(gfp_mask & __GFP_WAIT)) + if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) mutex_unlock(&cc->bio_alloc_lock); return clone; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 3a7cade5e27d..1452ed9aacb4 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -244,7 +244,7 @@ static int kcopyd_get_pages(struct dm_kcopyd_client *kc, *pages = NULL; do { - pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY); + pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM); if (unlikely(!pl)) { /* Use reserved pages */ pl = kc->pages; diff --git a/drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c b/drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c index 1bd2fd47421f..4432fd69b7cb 100644 --- a/drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c +++ b/drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c @@ -1297,7 +1297,7 @@ static struct solo_enc_dev *solo_enc_alloc(struct solo_dev *solo_dev, solo_enc->vidq.ops = &solo_enc_video_qops; solo_enc->vidq.mem_ops = &vb2_dma_sg_memops; solo_enc->vidq.drv_priv = solo_enc; - solo_enc->vidq.gfp_flags = __GFP_DMA32; + solo_enc->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM; solo_enc->vidq.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; solo_enc->vidq.buf_struct_size = sizeof(struct solo_vb2_buf); solo_enc->vidq.lock = &solo_enc->lock; diff --git a/drivers/media/pci/solo6x10/solo6x10-v4l2.c b/drivers/media/pci/solo6x10/solo6x10-v4l2.c index 26df903585d7..f7ce493b1fee 100644 --- a/drivers/media/pci/solo6x10/solo6x10-v4l2.c +++ b/drivers/media/pci/solo6x10/solo6x10-v4l2.c @@ -678,7 +678,7 @@ int solo_v4l2_init(struct solo_dev *solo_dev, unsigned nr) solo_dev->vidq.mem_ops = &vb2_dma_contig_memops; solo_dev->vidq.drv_priv = solo_dev; solo_dev->vidq.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; - solo_dev->vidq.gfp_flags = __GFP_DMA32; + solo_dev->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM; solo_dev->vidq.buf_struct_size = sizeof(struct solo_vb2_buf); 
solo_dev->vidq.lock = &solo_dev->lock; ret = vb2_queue_init(&solo_dev->vidq); diff --git a/drivers/media/pci/tw68/tw68-video.c b/drivers/media/pci/tw68/tw68-video.c index 4c3293dcddbc..46642ef9151b 100644 --- a/drivers/media/pci/tw68/tw68-video.c +++ b/drivers/media/pci/tw68/tw68-video.c @@ -979,7 +979,7 @@ int tw68_video_init2(struct tw68_dev *dev, int video_nr) dev->vidq.ops = &tw68_video_qops; dev->vidq.mem_ops = &vb2_dma_sg_memops; dev->vidq.drv_priv = dev; - dev->vidq.gfp_flags = __GFP_DMA32; + dev->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM; dev->vidq.buf_struct_size = sizeof(struct tw68_buf); dev->vidq.lock = &dev->lock; dev->vidq.min_buffers_needed = 2; diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 89300870fefb..1e688bfec567 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -75,7 +75,7 @@ MODULE_LICENSE("GPL"); /* * Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We don't - * allow wait (__GFP_WAIT) for NOSLEEP page allocations. Use + * allow wait (__GFP_RECLAIM) for NOSLEEP page allocations. Use * __GFP_NOWARN, to suppress page allocation failure warnings. 
*/ #define VMW_PAGE_ALLOC_NOSLEEP (__GFP_HIGHMEM|__GFP_NOWARN) diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index a91cee90aef9..95c13b2ffa79 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -1216,8 +1216,7 @@ EXPORT_SYMBOL_GPL(mtd_writev); */ void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size) { - gfp_t flags = __GFP_NOWARN | __GFP_WAIT | - __GFP_NORETRY | __GFP_NO_KSWAPD; + gfp_t flags = __GFP_NOWARN | __GFP_DIRECT_RECLAIM | __GFP_NORETRY; size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE); void *kbuf; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 44173be5cbf0..f8d7a2f06950 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -691,7 +691,7 @@ static void *bnx2x_frag_alloc(const struct bnx2x_fastpath *fp, gfp_t gfp_mask) { if (fp->rx_frag_size) { /* GFP_KERNEL allocations are used only during initialization */ - if (unlikely(gfp_mask & __GFP_WAIT)) + if (unlikely(gfpflags_allow_blocking(gfp_mask))) return (void *)__get_free_page(gfp_mask); return netdev_alloc_frag(fp->rx_frag_size); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9f4fe3a5f41e..97b6640a3745 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1025,11 +1025,13 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, req->special = (void *)0; if (buffer && bufflen) { - ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT); + ret = blk_rq_map_kern(q, req, buffer, bufflen, + __GFP_DIRECT_RECLAIM); if (ret) goto out; } else if (ubuffer && bufflen) { - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT); + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + __GFP_DIRECT_RECLAIM); if (ret) goto out; bio = req->bio; diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 66a96cd98b97..984ddcb4786d 100644 --- 
a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -1970,7 +1970,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) struct request *req; /* - * blk_get_request with GFP_KERNEL (__GFP_WAIT) sleeps until a + * blk_get_request with GFP_KERNEL (__GFP_RECLAIM) sleeps until a * request becomes available */ req = blk_get_request(sdev->request_queue, READ, GFP_KERNEL); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 126a48c6431e..dd8ad2a44510 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -222,13 +222,13 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, int write = (data_direction == DMA_TO_DEVICE); int ret = DRIVER_ERROR << 24; - req = blk_get_request(sdev->request_queue, write, __GFP_WAIT); + req = blk_get_request(sdev->request_queue, write, __GFP_RECLAIM); if (IS_ERR(req)) return ret; blk_rq_set_block_pc(req); if (bufflen && blk_rq_map_kern(sdev->request_queue, req, - buffer, bufflen, __GFP_WAIT)) + buffer, bufflen, __GFP_RECLAIM)) goto out; req->cmd_len = COMMAND_SIZE(cmd[0]); diff --git a/drivers/staging/android/ion/ion_system_heap.c b/drivers/staging/android/ion/ion_system_heap.c index ada724aab3d5..d4c3e5512dd5 100644 --- a/drivers/staging/android/ion/ion_system_heap.c +++ b/drivers/staging/android/ion/ion_system_heap.c @@ -27,7 +27,7 @@ #include "ion_priv.h" static gfp_t high_order_gfp_flags = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN | - __GFP_NORETRY) & ~__GFP_WAIT; + __GFP_NORETRY) & ~__GFP_DIRECT_RECLAIM; static gfp_t low_order_gfp_flags = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN); static const unsigned int orders[] = {8, 4, 0}; static const int num_orders = ARRAY_SIZE(orders); diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h index 6af733de69ca..f0b0423a716b 100644 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h +++ 
b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h @@ -95,7 +95,7 @@ do { \ do { \ LASSERT(!in_interrupt() || \ ((size) <= LIBCFS_VMALLOC_SIZE && \ - ((mask) & __GFP_WAIT) == 0)); \ + !gfpflags_allow_blocking(mask))); \ } while (0) #define LIBCFS_ALLOC_POST(ptr, size) \ diff --git a/drivers/staging/lustre/lnet/lnet/router.c b/drivers/staging/lustre/lnet/lnet/router.c index fe49f1b87652..4ea651c6db3a 100644 --- a/drivers/staging/lustre/lnet/lnet/router.c +++ b/drivers/staging/lustre/lnet/lnet/router.c @@ -1245,7 +1245,7 @@ lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt) for (i = 0; i < npages; i++) { page = alloc_pages_node( cfs_cpt_spread_node(lnet_cpt_table(), cpt), - __GFP_ZERO | GFP_IOFS, 0); + GFP_KERNEL | __GFP_ZERO, 0); if (page == NULL) { while (--i >= 0) __free_page(rb->rb_kiov[i].kiov_page); diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.c b/drivers/staging/lustre/lnet/selftest/conrpc.c index 0060ff64f88e..64a0335934f3 100644 --- a/drivers/staging/lustre/lnet/selftest/conrpc.c +++ b/drivers/staging/lustre/lnet/selftest/conrpc.c @@ -860,7 +860,7 @@ lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, bulk->bk_iovs[i].kiov_offset = 0; bulk->bk_iovs[i].kiov_len = len; bulk->bk_iovs[i].kiov_page = - alloc_page(GFP_IOFS); + alloc_page(GFP_KERNEL); if (bulk->bk_iovs[i].kiov_page == NULL) { lstcon_rpc_put(*crpc); diff --git a/drivers/staging/lustre/lnet/selftest/rpc.c b/drivers/staging/lustre/lnet/selftest/rpc.c index 162f9d330496..7005002c15da 100644 --- a/drivers/staging/lustre/lnet/selftest/rpc.c +++ b/drivers/staging/lustre/lnet/selftest/rpc.c @@ -146,7 +146,7 @@ srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len, int sink) int nob; pg = alloc_pages_node(cfs_cpt_spread_node(lnet_cpt_table(), cpt), - GFP_IOFS, 0); + GFP_KERNEL, 0); if (pg == NULL) { CERROR("Can't allocate page %d of %d\n", i, bulk_npg); srpc_free_bulk(bk); diff --git a/drivers/staging/lustre/lustre/libcfs/module.c 
b/drivers/staging/lustre/lustre/libcfs/module.c index 50e8fd23fa17..07a68594c279 100644 --- a/drivers/staging/lustre/lustre/libcfs/module.c +++ b/drivers/staging/lustre/lustre/libcfs/module.c @@ -319,7 +319,7 @@ static int libcfs_ioctl(struct cfs_psdev_file *pfile, unsigned long cmd, void *a struct libcfs_ioctl_data *data; int err = 0; - LIBCFS_ALLOC_GFP(buf, 1024, GFP_IOFS); + LIBCFS_ALLOC_GFP(buf, 1024, GFP_KERNEL); if (buf == NULL) return -ENOMEM; diff --git a/drivers/staging/lustre/lustre/libcfs/tracefile.c b/drivers/staging/lustre/lustre/libcfs/tracefile.c index 973c7c209dfc..f2d018d7823c 100644 --- a/drivers/staging/lustre/lustre/libcfs/tracefile.c +++ b/drivers/staging/lustre/lustre/libcfs/tracefile.c @@ -810,7 +810,7 @@ int cfs_trace_allocate_string_buffer(char **str, int nob) if (nob > 2 * PAGE_CACHE_SIZE) /* string must be "sensible" */ return -EINVAL; - *str = kmalloc(nob, GFP_IOFS | __GFP_ZERO); + *str = kmalloc(nob, GFP_KERNEL | __GFP_ZERO); if (*str == NULL) return -ENOMEM; diff --git a/drivers/staging/lustre/lustre/llite/remote_perm.c b/drivers/staging/lustre/lustre/llite/remote_perm.c index c902133dfc97..fe4a72268e3a 100644 --- a/drivers/staging/lustre/lustre/llite/remote_perm.c +++ b/drivers/staging/lustre/lustre/llite/remote_perm.c @@ -82,7 +82,7 @@ static struct hlist_head *alloc_rmtperm_hash(void) struct hlist_head *hash; int i; - hash = kmem_cache_alloc(ll_rmtperm_hash_cachep, GFP_IOFS | __GFP_ZERO); + hash = kmem_cache_alloc(ll_rmtperm_hash_cachep, GFP_NOFS | __GFP_ZERO); if (!hash) return NULL; diff --git a/drivers/staging/lustre/lustre/mgc/mgc_request.c b/drivers/staging/lustre/lustre/mgc/mgc_request.c index b81efcd997ae..5f53f3b7ceff 100644 --- a/drivers/staging/lustre/lustre/mgc/mgc_request.c +++ b/drivers/staging/lustre/lustre/mgc/mgc_request.c @@ -1112,7 +1112,7 @@ static int mgc_apply_recover_logs(struct obd_device *mgc, LASSERT(cfg->cfg_instance != NULL); LASSERT(cfg->cfg_sb == cfg->cfg_instance); - inst = kzalloc(PAGE_CACHE_SIZE, 
GFP_NOFS); + inst = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL); if (!inst) return -ENOMEM; @@ -1308,14 +1308,14 @@ static int mgc_process_recover_log(struct obd_device *obd, if (cfg->cfg_last_idx == 0) /* the first time */ nrpages = CONFIG_READ_NRPAGES_INIT; - pages = kcalloc(nrpages, sizeof(*pages), GFP_NOFS); + pages = kcalloc(nrpages, sizeof(*pages), GFP_KERNEL); if (pages == NULL) { rc = -ENOMEM; goto out; } for (i = 0; i < nrpages; i++) { - pages[i] = alloc_page(GFP_IOFS); + pages[i] = alloc_page(GFP_KERNEL); if (pages[i] == NULL) { rc = -ENOMEM; goto out; @@ -1466,7 +1466,7 @@ static int mgc_process_cfg_log(struct obd_device *mgc, if (cld->cld_cfg.cfg_sb) lsi = s2lsi(cld->cld_cfg.cfg_sb); - env = kzalloc(sizeof(*env), GFP_NOFS); + env = kzalloc(sizeof(*env), GFP_KERNEL); if (!env) return -ENOMEM; diff --git a/drivers/staging/lustre/lustre/obdecho/echo_client.c b/drivers/staging/lustre/lustre/obdecho/echo_client.c index b6f000bb8c82..f61ef669644c 100644 --- a/drivers/staging/lustre/lustre/obdecho/echo_client.c +++ b/drivers/staging/lustre/lustre/obdecho/echo_client.c @@ -1562,7 +1562,7 @@ static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa, (oa->o_valid & OBD_MD_FLFLAGS) != 0 && (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); - gfp_mask = ((ostid_id(&oa->o_oi) & 2) == 0) ? GFP_IOFS : GFP_HIGHUSER; + gfp_mask = ((ostid_id(&oa->o_oi) & 2) == 0) ? 
GFP_KERNEL : GFP_HIGHUSER; LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ); LASSERT(lsm != NULL); diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c index cfb83bcfcb17..b1d1a87f05e3 100644 --- a/drivers/staging/lustre/lustre/osc/osc_cache.c +++ b/drivers/staging/lustre/lustre/osc/osc_cache.c @@ -346,7 +346,7 @@ static struct osc_extent *osc_extent_alloc(struct osc_object *obj) { struct osc_extent *ext; - ext = kmem_cache_alloc(osc_extent_kmem, GFP_IOFS | __GFP_ZERO); + ext = kmem_cache_alloc(osc_extent_kmem, GFP_NOFS | __GFP_ZERO); if (ext == NULL) return NULL; diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c index 47a1202fcbdf..8666f3ad24e9 100644 --- a/drivers/staging/rdma/hfi1/init.c +++ b/drivers/staging/rdma/hfi1/init.c @@ -1560,7 +1560,7 @@ int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) * heavy filesystem activity makes these fail, and we can * use compound pages. */ - gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; + gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP; /* * The minimum size of the eager buffers is a groups of MTU-sized diff --git a/drivers/staging/rdma/ipath/ipath_file_ops.c b/drivers/staging/rdma/ipath/ipath_file_ops.c index 5d9b9dbd8fc4..13c3cd11ab92 100644 --- a/drivers/staging/rdma/ipath/ipath_file_ops.c +++ b/drivers/staging/rdma/ipath/ipath_file_ops.c @@ -905,7 +905,7 @@ static int ipath_create_user_egr(struct ipath_portdata *pd) * heavy filesystem activity makes these fail, and we can * use compound pages. 
*/ - gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; + gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP; egrcnt = dd->ipath_rcvegrcnt; /* TID number offset for this port */ diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c index cd54e72a6c50..5ec533826621 100644 --- a/drivers/usb/gadget/function/f_mass_storage.c +++ b/drivers/usb/gadget/function/f_mass_storage.c @@ -2345,7 +2345,6 @@ static void fsg_disable(struct usb_function *f) static void handle_exception(struct fsg_common *common) { - siginfo_t info; int i; struct fsg_buffhd *bh; enum fsg_state old_state; @@ -2357,8 +2356,7 @@ static void handle_exception(struct fsg_common *common) * into a high-priority EXIT exception. */ for (;;) { - int sig = - dequeue_signal_lock(current, &current->blocked, &info); + int sig = kernel_dequeue_signal(NULL); if (!sig) break; if (sig != SIGUSR1) { diff --git a/drivers/usb/host/u132-hcd.c b/drivers/usb/host/u132-hcd.c index 0a94895a358d..692ccc69345e 100644 --- a/drivers/usb/host/u132-hcd.c +++ b/drivers/usb/host/u132-hcd.c @@ -2244,7 +2244,7 @@ static int u132_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, { struct u132 *u132 = hcd_to_u132(hcd); if (irqs_disabled()) { - if (__GFP_WAIT & mem_flags) { + if (gfpflags_allow_blocking(mem_flags)) { printk(KERN_ERR "invalid context for function that might sleep\n"); return -EINVAL; } diff --git a/drivers/video/fbdev/vermilion/vermilion.c b/drivers/video/fbdev/vermilion/vermilion.c index 6b70d7f62b2f..1c1e95a0b8fa 100644 --- a/drivers/video/fbdev/vermilion/vermilion.c +++ b/drivers/video/fbdev/vermilion/vermilion.c @@ -99,7 +99,7 @@ static int vmlfb_alloc_vram_area(struct vram_area *va, unsigned max_order, * below the first 16MB.
*/ - flags = __GFP_DMA | __GFP_HIGH; + flags = __GFP_DMA | __GFP_HIGH | __GFP_KSWAPD_RECLAIM; va->logical = __get_free_pages(flags, --max_order); } while (va->logical == 0 && max_order > min_order); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 97b049ad0594..c473c42d7d6c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -482,13 +482,12 @@ static noinline int add_ra_bio_pages(struct inode *inode, goto next; } - page = __page_cache_alloc(mapping_gfp_mask(mapping) & - ~__GFP_FS); + page = __page_cache_alloc(mapping_gfp_constraint(mapping, + ~__GFP_FS)); if (!page) break; - if (add_to_page_cache_lru(page, mapping, pg_index, - GFP_NOFS)) { + if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { page_cache_release(page); goto next; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a2e73f6053a8..8c58191249cc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3367,7 +3367,7 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) { - return mapping_gfp_mask(mapping) & ~__GFP_FS; + return mapping_gfp_constraint(mapping, ~__GFP_FS); } /* extent-tree.c */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2d4667594681..640598c0d0e7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2575,7 +2575,7 @@ int open_ctree(struct super_block *sb, fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ /* readahead state */ - INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); spin_lock_init(&fs_info->reada_lock); fs_info->thread_pool_size = min_t(unsigned long, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 33a01ea41465..9abe18763a7f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -616,7 +616,7 @@ static int 
__clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) clear = 1; again: - if (!prealloc && (mask & __GFP_WAIT)) { + if (!prealloc && gfpflags_allow_blocking(mask)) { /* * Don't care for allocation failure here because we might end * up not needing the pre-allocated extent state at all, which @@ -741,7 +741,7 @@ search_again: if (start > end) goto out; spin_unlock(&tree->lock); - if (mask & __GFP_WAIT) + if (gfpflags_allow_blocking(mask)) cond_resched(); goto again; } @@ -874,7 +874,7 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, bits |= EXTENT_FIRST_DELALLOC; again: - if (!prealloc && (mask & __GFP_WAIT)) { + if (!prealloc && gfpflags_allow_blocking(mask)) { prealloc = alloc_extent_state(mask); BUG_ON(!prealloc); } @@ -1052,7 +1052,7 @@ search_again: if (start > end) goto out; spin_unlock(&tree->lock); - if (mask & __GFP_WAIT) + if (gfpflags_allow_blocking(mask)) cond_resched(); goto again; } @@ -1100,7 +1100,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, btrfs_debug_check_extent_io_range(tree, start, end); again: - if (!prealloc && (mask & __GFP_WAIT)) { + if (!prealloc && gfpflags_allow_blocking(mask)) { /* * Best effort, don't worry if extent state allocation fails * here for the first iteration. 
We might have a cached state @@ -1278,7 +1278,7 @@ search_again: if (start > end) goto out; spin_unlock(&tree->lock); - if (mask & __GFP_WAIT) + if (gfpflags_allow_blocking(mask)) cond_resched(); first_iteration = false; goto again; @@ -4386,7 +4386,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; - if ((mask & __GFP_WAIT) && + if (gfpflags_allow_blocking(mask) && page->mapping->host->i_size > 16 * 1024 * 1024) { u64 len; while (start <= end) { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0948d34cb84a..85a1f8621b51 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -85,8 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, } mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_mask(inode->i_mapping) & - ~(__GFP_FS | __GFP_HIGHMEM)); + mapping_gfp_constraint(inode->i_mapping, + ~(__GFP_FS | __GFP_HIGHMEM))); return inode; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 17ed76d18eb6..9b2dafa5ba59 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -232,8 +232,8 @@ static struct btrfs_device *__alloc_device(void) spin_lock_init(&dev->reada_lock); atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); - INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT); - INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); return dev; } diff --git a/fs/buffer.c b/fs/buffer.c index 82283abb2795..51aff0296ce2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -999,7 +999,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, int ret = 0; /* Will call free_more_memory() */ gfp_t gfp_mask; - gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp; + gfp_mask = mapping_gfp_constraint(inode->i_mapping, 
~__GFP_FS) | gfp; /* * XXX: __getblk_slow() can not really deal with failure and diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index aecd0859eacb..9c4b737a54df 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -30,7 +30,7 @@ extern unsigned cachefiles_debug; #define CACHEFILES_DEBUG_KLEAVE 2 #define CACHEFILES_DEBUG_KDEBUG 4 -#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC) +#define cachefiles_gfp (__GFP_RECLAIM | __GFP_NORETRY | __GFP_NOMEMALLOC) /* * node records diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 9d23e788d1df..b7d218a168fb 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1283,8 +1283,8 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int ret1; struct address_space *mapping = inode->i_mapping; struct page *page = find_or_create_page(mapping, 0, - mapping_gfp_mask(mapping) & - ~__GFP_FS); + mapping_gfp_constraint(mapping, + ~__GFP_FS)); if (!page) { ret = VM_FAULT_OOM; goto out; @@ -1428,7 +1428,8 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, if (i_size_read(inode) == 0) return; page = find_or_create_page(mapping, 0, - mapping_gfp_mask(mapping) & ~__GFP_FS); + mapping_gfp_constraint(mapping, + ~__GFP_FS)); if (!page) return; if (PageUptodate(page)) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 47c5c97e2dd3..0068e82217c3 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3380,7 +3380,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, struct page *page, *tpage; unsigned int expected_index; int rc; - gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping); + gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); INIT_LIST_HEAD(tmplist); diff --git a/fs/coredump.c b/fs/coredump.c index a8f75640ac86..1777331eee76 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -280,23 +280,24 @@ out: return ispipe; } -static int zap_process(struct task_struct *start, int exit_code) +static int 
zap_process(struct task_struct *start, int exit_code, int flags) { struct task_struct *t; int nr = 0; + /* ignore all signals except SIGKILL, see prepare_signal() */ + start->signal->flags = SIGNAL_GROUP_COREDUMP | flags; start->signal->group_exit_code = exit_code; start->signal->group_stop_count = 0; - t = start; - do { + for_each_thread(start, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); if (t != current && t->mm) { sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); nr++; } - } while_each_thread(start, t); + } return nr; } @@ -311,10 +312,8 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, spin_lock_irq(&tsk->sighand->siglock); if (!signal_group_exit(tsk->signal)) { mm->core_state = core_state; - nr = zap_process(tsk, exit_code); tsk->signal->group_exit_task = tsk; - /* ignore all signals except SIGKILL, see prepare_signal() */ - tsk->signal->flags = SIGNAL_GROUP_COREDUMP; + nr = zap_process(tsk, exit_code, 0); clear_tsk_thread_flag(tsk, TIF_SIGPENDING); } spin_unlock_irq(&tsk->sighand->siglock); @@ -360,18 +359,18 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, continue; if (g->flags & PF_KTHREAD) continue; - p = g; - do { - if (p->mm) { - if (unlikely(p->mm == mm)) { - lock_task_sighand(p, &flags); - nr += zap_process(p, exit_code); - p->signal->flags = SIGNAL_GROUP_EXIT; - unlock_task_sighand(p, &flags); - } - break; + + for_each_thread(g, p) { + if (unlikely(!p->mm)) + continue; + if (unlikely(p->mm == mm)) { + lock_task_sighand(p, &flags); + nr += zap_process(p, exit_code, + SIGNAL_GROUP_EXIT); + unlock_task_sighand(p, &flags); } - } while_each_thread(g, p); + break; + } } rcu_read_unlock(); done: diff --git a/fs/direct-io.c b/fs/direct-io.c index 3ae0e0427191..18e7554cf94c 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -361,7 +361,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, /* * bio_alloc() is guaranteed to return a bio when called with - * __GFP_WAIT and we request 
a valid number of vectors. + * __GFP_RECLAIM and we request a valid number of vectors. */ bio = bio_alloc(GFP_KERNEL, nr_vecs); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e8d620a484f6..7d1aad1d9313 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3386,7 +3386,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, int err = 0; page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, - mapping_gfp_mask(mapping) & ~__GFP_FS); + mapping_gfp_constraint(mapping, ~__GFP_FS)); if (!page) return -ENOMEM; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index d94af71a4e7f..5dc5e95063de 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -166,7 +166,7 @@ int ext4_mpage_readpages(struct address_space *mapping, page = list_entry(pages->prev, struct page, lru); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, - GFP_KERNEL & mapping_gfp_mask(mapping))) + mapping_gfp_constraint(mapping, GFP_KERNEL))) goto next_page; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 04d0f1b33409..753f4e68b820 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1061,7 +1061,7 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, return 0; if (journal) return jbd2_journal_try_to_free_buffers(journal, page, - wait & ~__GFP_WAIT); + wait & ~__GFP_DIRECT_RECLAIM); return try_to_free_buffers(page); } diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index d403c69bee08..4304072161aa 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -111,7 +111,7 @@ struct fscache_cookie *__fscache_acquire_cookie( /* radix tree insertion won't use the preallocation pool unless it's * told it may not wait */ - INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); switch (cookie->def->type) { case FSCACHE_COOKIE_TYPE_INDEX: diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 483bbc613bf0..79483b3d8c6f 100644 --- a/fs/fscache/page.c 
+++ b/fs/fscache/page.c @@ -58,7 +58,7 @@ bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page) /* * decide whether a page can be released, possibly by cancelling a store to it - * - we're allowed to sleep if __GFP_WAIT is flagged + * - we're allowed to sleep if __GFP_DIRECT_RECLAIM is flagged */ bool __fscache_maybe_release_page(struct fscache_cookie *cookie, struct page *page, @@ -122,7 +122,7 @@ page_busy: * allocator as the work threads writing to the cache may all end up * sleeping on memory allocation, so we may need to impose a timeout * too. */ - if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) { + if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) { fscache_stat(&fscache_n_store_vmscan_busy); return false; } @@ -132,7 +132,7 @@ page_busy: _debug("fscache writeout timeout page: %p{%lx}", page, page->index); - gfp &= ~__GFP_WAIT; + gfp &= ~__GFP_DIRECT_RECLAIM; goto try_again; } EXPORT_SYMBOL(__fscache_maybe_release_page); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6b8338ec2464..89463eee6791 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1937,8 +1937,8 @@ out: * @journal: journal for operation * @page: to try and free * @gfp_mask: we use the mask to detect how hard should we try to release - * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to - * release the buffers. + * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS is set, we wait for commit + * code to release the buffers. 
* * * For all the buffers on this page, diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index bb9cebc9ca8a..e5c1783ab64a 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -80,7 +80,6 @@ static int jffs2_garbage_collect_thread(void *_c) siginitset(&hupmask, sigmask(SIGHUP)); allow_signal(SIGKILL); allow_signal(SIGSTOP); - allow_signal(SIGCONT); allow_signal(SIGHUP); c->gc_task = current; @@ -121,20 +120,18 @@ static int jffs2_garbage_collect_thread(void *_c) /* Put_super will send a SIGKILL and then wait on the sem. */ while (signal_pending(current) || freezing(current)) { - siginfo_t info; unsigned long signr; if (try_to_freeze()) goto again; - signr = dequeue_signal_lock(current, &current->blocked, &info); + signr = kernel_dequeue_signal(NULL); switch(signr) { case SIGSTOP: jffs2_dbg(1, "%s(): SIGSTOP received\n", __func__); - set_current_state(TASK_STOPPED); - schedule(); + kernel_signal_stop(); break; case SIGKILL: diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 63f31c0733c5..f3a4857ff071 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1264,7 +1264,7 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) { if ((c->flash_size % c->sector_size) != 0) { c->flash_size = (c->flash_size / c->sector_size) * c->sector_size; pr_warn("flash size adjusted to %dKiB\n", c->flash_size); - }; + } c->wbuf_ofs = 0xFFFFFFFF; c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 7f9b096d8d57..6de0fbfc6c00 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -57,7 +57,7 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index, filler_t *filler = super->s_devops->readpage; struct page *page; - BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS); + BUG_ON(mapping_gfp_constraint(mapping, __GFP_FS)); if (use_filler) page = read_cache_page(mapping, index, filler, sb); else { diff --git a/fs/mpage.c b/fs/mpage.c index 09abba7653aa..1480d3a18037 100644 --- a/fs/mpage.c +++ 
b/fs/mpage.c @@ -361,7 +361,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, sector_t last_block_in_bio = 0; struct buffer_head map_bh; unsigned long first_logical_block = 0; - gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping); + gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); map_bh.b_state = 0; map_bh.b_size = 0; @@ -397,7 +397,7 @@ int mpage_readpage(struct page *page, get_block_t get_block) sector_t last_block_in_bio = 0; struct buffer_head map_bh; unsigned long first_logical_block = 0; - gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(page->mapping); + gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); map_bh.b_state = 0; map_bh.b_size = 0; diff --git a/fs/namei.c b/fs/namei.c index 6f567347f14f..d84d7c7515fc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4604,7 +4604,7 @@ EXPORT_SYMBOL(__page_symlink); int page_symlink(struct inode *inode, const char *symname, int len) { return __page_symlink(inode, symname, len, - !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); + !mapping_gfp_constraint(inode->i_mapping, __GFP_FS)); } EXPORT_SYMBOL(page_symlink); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 37f639d50af5..93e236429c5d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -473,8 +473,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp) dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); /* Always try to initiate a 'commit' if relevant, but only - * wait for it if __GFP_WAIT is set. Even then, only wait 1 - * second and only if the 'bdi' is not congested. + * wait for it if the caller allows blocking. Even then, + * only wait 1 second and only if the 'bdi' is not congested. * Waiting indefinitely can cause deadlocks when the NFS * server is on this machine, when a new TCP connection is * needed and in other rare cases. 
There is no particular @@ -484,7 +484,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp) if (mapping) { struct nfs_server *nfss = NFS_SERVER(mapping->host); nfs_commit_inode(mapping->host, 0); - if ((gfp & __GFP_WAIT) && + if (gfpflags_allow_blocking(gfp) && !bdi_write_congested(&nfss->backing_dev_info)) { wait_on_page_bit_killable_timeout(page, PG_private, HZ); diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 8df0f3b7839b..2ccbf5531554 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -133,38 +133,38 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) /** * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group - * @inode: inode of metadata file using this allocator - * @group: group number * @desc: pointer to descriptor structure for the group + * @lock: spin lock protecting @desc */ static unsigned long -nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, - const struct nilfs_palloc_group_desc *desc) +nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc, + spinlock_t *lock) { unsigned long nfree; - spin_lock(nilfs_mdt_bgl_lock(inode, group)); + spin_lock(lock); nfree = le32_to_cpu(desc->pg_nfrees); - spin_unlock(nilfs_mdt_bgl_lock(inode, group)); + spin_unlock(lock); return nfree; } /** * nilfs_palloc_group_desc_add_entries - adjust count of free entries - * @inode: inode of metadata file using this allocator - * @group: group number * @desc: pointer to descriptor structure for the group + * @lock: spin lock protecting @desc * @n: delta to be added */ -static void -nilfs_palloc_group_desc_add_entries(struct inode *inode, - unsigned long group, - struct nilfs_palloc_group_desc *desc, - u32 n) +static u32 +nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc, + spinlock_t *lock, u32 n) { - spin_lock(nilfs_mdt_bgl_lock(inode, group)); + u32 nfree; + + spin_lock(lock); le32_add_cpu(&desc->pg_nfrees, n); - 
spin_unlock(nilfs_mdt_bgl_lock(inode, group)); + nfree = le32_to_cpu(desc->pg_nfrees); + spin_unlock(lock); + return nfree; } /** @@ -240,6 +240,26 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, } /** + * nilfs_palloc_delete_block - delete a block on the persistent allocator file + * @inode: inode of metadata file using this allocator + * @blkoff: block offset + * @prev: nilfs_bh_assoc struct of the last used buffer + * @lock: spin lock protecting @prev + */ +static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff, + struct nilfs_bh_assoc *prev, + spinlock_t *lock) +{ + spin_lock(lock); + if (prev->bh && blkoff == prev->blkoff) { + brelse(prev->bh); + prev->bh = NULL; + } + spin_unlock(lock); + return nilfs_mdt_delete_block(inode, blkoff); +} + +/** * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block * @inode: inode of metadata file using this allocator * @group: group number @@ -278,6 +298,22 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode, } /** + * nilfs_palloc_delete_bitmap_block - delete a bitmap block + * @inode: inode of metadata file using this allocator + * @group: group number + */ +static int nilfs_palloc_delete_bitmap_block(struct inode *inode, + unsigned long group) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_delete_block(inode, + nilfs_palloc_bitmap_blkoff(inode, + group), + &cache->prev_bitmap, &cache->lock); +} + +/** * nilfs_palloc_get_entry_block - get buffer head of an entry block * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. 
inode number) @@ -296,6 +332,20 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, } /** + * nilfs_palloc_delete_entry_block - delete an entry block + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry + */ +static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_delete_block(inode, + nilfs_palloc_entry_blkoff(inode, nr), + &cache->prev_entry, &cache->lock); +} + +/** * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor * @inode: inode of metadata file using this allocator * @group: group number @@ -332,51 +382,40 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, /** * nilfs_palloc_find_available_slot - find available slot in a group - * @inode: inode of metadata file using this allocator - * @group: group number - * @target: offset number of an entry in the group (start point) * @bitmap: bitmap of the group + * @target: offset number of an entry in the group (start point) * @bsize: size in bits + * @lock: spin lock protecting @bitmap */ -static int nilfs_palloc_find_available_slot(struct inode *inode, - unsigned long group, +static int nilfs_palloc_find_available_slot(unsigned char *bitmap, unsigned long target, - unsigned char *bitmap, - int bsize) -{ - int curr, pos, end, i; - - if (target > 0) { - end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1); - if (end > bsize) - end = bsize; - pos = nilfs_find_next_zero_bit(bitmap, end, target); - if (pos < end && - !nilfs_set_bit_atomic( - nilfs_mdt_bgl_lock(inode, group), pos, bitmap)) - return pos; - } else - end = 0; - - for (i = 0, curr = end; - i < bsize; - i += BITS_PER_LONG, curr += BITS_PER_LONG) { - /* wrap around */ - if (curr >= bsize) - curr = 0; - while (*((unsigned long *)bitmap + curr / BITS_PER_LONG) - != ~0UL) { - end = curr + BITS_PER_LONG; - if (end > bsize) - end = bsize; - 
pos = nilfs_find_next_zero_bit(bitmap, end, curr); - if ((pos < end) && - !nilfs_set_bit_atomic( - nilfs_mdt_bgl_lock(inode, group), pos, - bitmap)) + unsigned bsize, + spinlock_t *lock) +{ + int pos, end = bsize; + + if (likely(target < bsize)) { + pos = target; + do { + pos = nilfs_find_next_zero_bit(bitmap, end, pos); + if (pos >= end) + break; + if (!nilfs_set_bit_atomic(lock, pos, bitmap)) return pos; - } + } while (++pos < end); + + end = target; + } + + /* wrap around */ + for (pos = 0; pos < end; pos++) { + pos = nilfs_find_next_zero_bit(bitmap, end, pos); + if (pos >= end) + break; + if (!nilfs_set_bit_atomic(lock, pos, bitmap)) + return pos; } + return -ENOSPC; } @@ -475,15 +514,15 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, void *desc_kaddr, *bitmap_kaddr; unsigned long group, maxgroup, ngroups; unsigned long group_offset, maxgroup_offset; - unsigned long n, entries_per_group, groups_per_desc_block; + unsigned long n, entries_per_group; unsigned long i, j; + spinlock_t *lock; int pos, ret; ngroups = nilfs_palloc_groups_count(inode); maxgroup = ngroups - 1; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); entries_per_group = nilfs_palloc_entries_per_group(inode); - groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode); for (i = 0; i < ngroups; i += n) { if (group >= ngroups) { @@ -501,8 +540,8 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, n = nilfs_palloc_rest_groups_in_desc_block(inode, group, maxgroup); for (j = 0; j < n; j++, desc++, group++) { - if (nilfs_palloc_group_desc_nfrees(inode, group, desc) - > 0) { + lock = nilfs_mdt_bgl_lock(inode, group); + if (nilfs_palloc_group_desc_nfrees(desc, lock) > 0) { ret = nilfs_palloc_get_bitmap_block( inode, group, 1, &bitmap_bh); if (ret < 0) @@ -510,12 +549,12 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, bitmap_kaddr = kmap(bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(bitmap_bh); pos = nilfs_palloc_find_available_slot( - 
inode, group, group_offset, bitmap, - entries_per_group); + bitmap, group_offset, + entries_per_group, lock); if (pos >= 0) { /* found a free entry */ nilfs_palloc_group_desc_add_entries( - inode, group, desc, -1); + desc, lock, -1); req->pr_entry_nr = entries_per_group * group + pos; kunmap(desc_bh->b_page); @@ -573,6 +612,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, unsigned long group, group_offset; unsigned char *bitmap; void *desc_kaddr, *bitmap_kaddr; + spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); desc_kaddr = kmap(req->pr_desc_bh->b_page); @@ -580,13 +620,15 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, req->pr_desc_bh, desc_kaddr); bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); + lock = nilfs_mdt_bgl_lock(inode, group); - if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), - group_offset, bitmap)) - printk(KERN_WARNING "%s: entry number %llu already freed\n", - __func__, (unsigned long long)req->pr_entry_nr); + if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) + nilfs_warning(inode->i_sb, __func__, + "entry number %llu already freed: ino=%lu\n", + (unsigned long long)req->pr_entry_nr, + (unsigned long)inode->i_ino); else - nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + nilfs_palloc_group_desc_add_entries(desc, lock, 1); kunmap(req->pr_bitmap_bh->b_page); kunmap(req->pr_desc_bh->b_page); @@ -611,6 +653,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, void *desc_kaddr, *bitmap_kaddr; unsigned char *bitmap; unsigned long group, group_offset; + spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); desc_kaddr = kmap(req->pr_desc_bh->b_page); @@ -618,12 +661,15 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, req->pr_desc_bh, desc_kaddr); bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); - if 
(!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), - group_offset, bitmap)) - printk(KERN_WARNING "%s: entry number %llu already freed\n", - __func__, (unsigned long long)req->pr_entry_nr); + lock = nilfs_mdt_bgl_lock(inode, group); + + if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) + nilfs_warning(inode->i_sb, __func__, + "entry number %llu already freed: ino=%lu\n", + (unsigned long long)req->pr_entry_nr, + (unsigned long)inode->i_ino); else - nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + nilfs_palloc_group_desc_add_entries(desc, lock, 1); kunmap(req->pr_bitmap_bh->b_page); kunmap(req->pr_desc_bh->b_page); @@ -680,22 +726,6 @@ void nilfs_palloc_abort_free_entry(struct inode *inode, } /** - * nilfs_palloc_group_is_in - judge if an entry is in a group - * @inode: inode of metadata file using this allocator - * @group: group number - * @nr: serial number of the entry (e.g. inode number) - */ -static int -nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) -{ - __u64 first, last; - - first = group * nilfs_palloc_entries_per_group(inode); - last = first + nilfs_palloc_entries_per_group(inode) - 1; - return (nr >= first) && (nr <= last); -} - -/** * nilfs_palloc_freev - deallocate a set of persistent objects * @inode: inode of metadata file using this allocator * @entry_nrs: array of entry numbers to be deallocated @@ -708,9 +738,18 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) unsigned char *bitmap; void *desc_kaddr, *bitmap_kaddr; unsigned long group, group_offset; - int i, j, n, ret; + __u64 group_min_nr, last_nrs[8]; + const unsigned long epg = nilfs_palloc_entries_per_group(inode); + const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block; + unsigned entry_start, end, pos; + spinlock_t *lock; + int i, j, k, ret; + u32 nfree; for (i = 0; i < nitems; i = j) { + int change_group = false; + int nempties = 0, n = 0; + group = nilfs_palloc_group(inode, entry_nrs[i], 
&group_offset); ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh); if (ret < 0) @@ -721,38 +760,89 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) brelse(desc_bh); return ret; } - desc_kaddr = kmap(desc_bh->b_page); - desc = nilfs_palloc_block_get_group_desc( - inode, group, desc_bh, desc_kaddr); + + /* Get the first entry number of the group */ + group_min_nr = (__u64)group * epg; + bitmap_kaddr = kmap(bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(bitmap_bh); - for (j = i, n = 0; - (j < nitems) && nilfs_palloc_group_is_in(inode, group, - entry_nrs[j]); - j++) { - nilfs_palloc_group(inode, entry_nrs[j], &group_offset); - if (!nilfs_clear_bit_atomic( - nilfs_mdt_bgl_lock(inode, group), - group_offset, bitmap)) { - printk(KERN_WARNING - "%s: entry number %llu already freed\n", - __func__, - (unsigned long long)entry_nrs[j]); + lock = nilfs_mdt_bgl_lock(inode, group); + + j = i; + entry_start = rounddown(group_offset, epb); + do { + if (!nilfs_clear_bit_atomic(lock, group_offset, + bitmap)) { + nilfs_warning(inode->i_sb, __func__, + "entry number %llu already freed: ino=%lu\n", + (unsigned long long)entry_nrs[j], + (unsigned long)inode->i_ino); } else { n++; } - } - nilfs_palloc_group_desc_add_entries(inode, group, desc, n); + + j++; + if (j >= nitems || entry_nrs[j] < group_min_nr || + entry_nrs[j] >= group_min_nr + epg) { + change_group = true; + } else { + group_offset = entry_nrs[j] - group_min_nr; + if (group_offset >= entry_start && + group_offset < entry_start + epb) { + /* This entry is in the same block */ + continue; + } + } + + /* Test if the entry block is empty or not */ + end = entry_start + epb; + pos = nilfs_find_next_bit(bitmap, end, entry_start); + if (pos >= end) { + last_nrs[nempties++] = entry_nrs[j - 1]; + if (nempties >= ARRAY_SIZE(last_nrs)) + break; + } + + if (change_group) + break; + + /* Go on to the next entry block */ + entry_start = rounddown(group_offset, epb); + } while (true); 
kunmap(bitmap_bh->b_page); - kunmap(desc_bh->b_page); + mark_buffer_dirty(bitmap_bh); + brelse(bitmap_bh); + for (k = 0; k < nempties; k++) { + ret = nilfs_palloc_delete_entry_block(inode, + last_nrs[k]); + if (ret && ret != -ENOENT) { + nilfs_warning(inode->i_sb, __func__, + "failed to delete block of entry %llu: ino=%lu, err=%d\n", + (unsigned long long)last_nrs[k], + (unsigned long)inode->i_ino, ret); + } + } + + desc_kaddr = kmap_atomic(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n); + kunmap_atomic(desc_kaddr); mark_buffer_dirty(desc_bh); - mark_buffer_dirty(bitmap_bh); nilfs_mdt_mark_dirty(inode); - - brelse(bitmap_bh); brelse(desc_bh); + + if (nfree == nilfs_palloc_entries_per_group(inode)) { + ret = nilfs_palloc_delete_bitmap_block(inode, group); + if (ret && ret != -ENOENT) { + nilfs_warning(inode->i_sb, __func__, + "failed to delete bitmap block of group %lu: ino=%lu, err=%d\n", + group, + (unsigned long)inode->i_ino, ret); + } + } } return 0; } diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index 4bd6451b5703..6e6f49aa53df 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -77,6 +77,7 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t); #define nilfs_set_bit_atomic ext2_set_bit_atomic #define nilfs_clear_bit_atomic ext2_clear_bit_atomic #define nilfs_find_next_zero_bit find_next_zero_bit_le +#define nilfs_find_next_bit find_next_bit_le /** * struct nilfs_bh_assoc - block offset and buffer head association diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 919fd5bb14a8..3a3821b00486 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -919,8 +919,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *right; - __u64 newkey; - __u64 newptr; int nchildren, n, move, ncblk; node = nilfs_btree_get_nonroot_node(path, level); @@ -942,9 +940,6 @@ 
static void nilfs_btree_split(struct nilfs_bmap *btree, if (!buffer_dirty(path[level].bp_sib_bh)) mark_buffer_dirty(path[level].bp_sib_bh); - newkey = nilfs_btree_node_get_key(right, 0); - newptr = path[level].bp_newreq.bpr_ptr; - if (move) { path[level].bp_index -= nilfs_btree_node_get_nchildren(node); nilfs_btree_node_insert(right, path[level].bp_index, @@ -1856,7 +1851,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr, const __u64 *keys, const __u64 *ptrs, int n) { - struct buffer_head *bh; + struct buffer_head *bh = NULL; union nilfs_bmap_ptr_req dreq, nreq, *di, *ni; struct nilfs_bmap_stats stats; int ret; diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 0d5fada91191..7dc23f100e57 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -155,7 +155,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; - __u64 start; sector_t blocknr; void *kaddr; int ret; @@ -169,7 +168,6 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) kaddr = kmap_atomic(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); - start = le64_to_cpu(entry->de_start); blocknr = le64_to_cpu(entry->de_blocknr); kunmap_atomic(kaddr); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 4a73d6dffabf..ac2f64943ff4 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -356,7 +356,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) goto failed; mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS)); root = NILFS_I(dir)->i_root; ii = NILFS_I(inode); @@ -522,7 +522,7 @@ static int __nilfs_read_inode(struct super_block *sb, up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); nilfs_set_inode_flags(inode); mapping_set_gfp_mask(inode->i_mapping, - 
mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS)); return 0; failed_unmap: diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index dee34d990281..1125f40233ff 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -33,6 +33,7 @@ #include "page.h" #include "mdt.h" +#include <trace/events/nilfs2.h> #define NILFS_MDT_MAX_RA_BLOCKS (16 - 1) @@ -68,6 +69,9 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, set_buffer_uptodate(bh); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(inode); + + trace_nilfs2_mdt_insert_new_block(inode, inode->i_ino, block); + return 0; } @@ -158,6 +162,8 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, get_bh(bh); submit_bh(mode, bh); ret = 0; + + trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode); out: get_bh(bh); *out_bh = bh; diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index fe529a87a208..03246cac3338 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -72,7 +72,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode) } /* Default GFP flags using highmem */ -#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM) +#define NILFS_MDT_GFP (__GFP_RECLAIM | __GFP_IO | __GFP_HIGHMEM) int nilfs_mdt_get_block(struct inode *, unsigned long, int, void (*init_block)(struct inode *, diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index ff00a0b7acb9..9b4f205d1173 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -582,7 +582,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { struct buffer_head *bh_sum = NULL; - struct nilfs_segment_summary *sum; + struct nilfs_segment_summary *sum = NULL; sector_t pseg_start; sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ unsigned long nsalvaged_blocks = 0; @@ -814,7 +814,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { struct buffer_head *bh_sum = NULL; - 
struct nilfs_segment_summary *sum; + struct nilfs_segment_summary *sum = NULL; sector_t pseg_start, pseg_end, sr_pseg_start = 0; sector_t seg_start, seg_end; /* range of full segment (block number) */ sector_t b, end; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index c6abbad9b8e3..3b65adaae7e4 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -77,6 +77,36 @@ enum { NILFS_ST_DONE, }; +#define CREATE_TRACE_POINTS +#include <trace/events/nilfs2.h> + +/* + * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get() are + * wrapper functions of stage count (nilfs_sc_info->sc_stage.scnt). Users of + * the variable must use them because transition of stage count must involve + * trace events (trace_nilfs2_collection_stage_transition). + * + * nilfs_sc_cstage_get() isn't required for the above purpose because it doesn't + * produce tracepoint events. It is provided just for making the intention + * clear. + */ +static inline void nilfs_sc_cstage_inc(struct nilfs_sc_info *sci) +{ + sci->sc_stage.scnt++; + trace_nilfs2_collection_stage_transition(sci); +} + +static inline void nilfs_sc_cstage_set(struct nilfs_sc_info *sci, int next_scnt) +{ + sci->sc_stage.scnt = next_scnt; + trace_nilfs2_collection_stage_transition(sci); +} + +static inline int nilfs_sc_cstage_get(struct nilfs_sc_info *sci) +{ + return sci->sc_stage.scnt; +} + /* State flags of collection */ #define NILFS_CF_NODE 0x0001 /* Collecting node blocks */ #define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */ @@ -184,11 +214,18 @@ int nilfs_transaction_begin(struct super_block *sb, { struct the_nilfs *nilfs; int ret = nilfs_prepare_segment_lock(ti); + struct nilfs_transaction_info *trace_ti; if (unlikely(ret < 0)) return ret; - if (ret > 0) + if (ret > 0) { + trace_ti = current->journal_info; + + trace_nilfs2_transaction_transition(sb, trace_ti, + trace_ti->ti_count, trace_ti->ti_flags, + TRACE_NILFS2_TRANSACTION_BEGIN); return 0; + } sb_start_intwrite(sb); @@ -199,6 
+236,11 @@ int nilfs_transaction_begin(struct super_block *sb, ret = -ENOSPC; goto failed; } + + trace_ti = current->journal_info; + trace_nilfs2_transaction_transition(sb, trace_ti, trace_ti->ti_count, + trace_ti->ti_flags, + TRACE_NILFS2_TRANSACTION_BEGIN); return 0; failed: @@ -231,6 +273,8 @@ int nilfs_transaction_commit(struct super_block *sb) ti->ti_flags |= NILFS_TI_COMMIT; if (ti->ti_count > 0) { ti->ti_count--; + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT); return 0; } if (nilfs->ns_writer) { @@ -242,6 +286,9 @@ int nilfs_transaction_commit(struct super_block *sb) nilfs_segctor_do_flush(sci, 0); } up_read(&nilfs->ns_segctor_sem); + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT); + current->journal_info = ti->ti_save; if (ti->ti_flags & NILFS_TI_SYNC) @@ -260,10 +307,15 @@ void nilfs_transaction_abort(struct super_block *sb) BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); if (ti->ti_count > 0) { ti->ti_count--; + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT); return; } up_read(&nilfs->ns_segctor_sem); + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT); + current->journal_info = ti->ti_save; if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); @@ -309,6 +361,9 @@ static void nilfs_transaction_lock(struct super_block *sb, current->journal_info = ti; for (;;) { + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_TRYLOCK); + down_write(&nilfs->ns_segctor_sem); if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) break; @@ -320,6 +375,9 @@ static void nilfs_transaction_lock(struct super_block *sb, } if (gcflag) ti->ti_flags |= NILFS_TI_GC; + + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, 
TRACE_NILFS2_TRANSACTION_LOCK); } static void nilfs_transaction_unlock(struct super_block *sb) @@ -332,6 +390,9 @@ static void nilfs_transaction_unlock(struct super_block *sb) up_write(&nilfs->ns_segctor_sem); current->journal_info = ti->ti_save; + + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_UNLOCK); } static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, @@ -1062,7 +1123,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) size_t ndone; int err = 0; - switch (sci->sc_stage.scnt) { + switch (nilfs_sc_cstage_get(sci)) { case NILFS_ST_INIT: /* Pre-processes */ sci->sc_stage.flags = 0; @@ -1071,7 +1132,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) sci->sc_nblk_inc = 0; sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN; if (mode == SC_LSEG_DSYNC) { - sci->sc_stage.scnt = NILFS_ST_DSYNC; + nilfs_sc_cstage_set(sci, NILFS_ST_DSYNC); goto dsync_mode; } } @@ -1079,10 +1140,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) sci->sc_stage.dirty_file_ptr = NULL; sci->sc_stage.gc_inode_ptr = NULL; if (mode == SC_FLUSH_DAT) { - sci->sc_stage.scnt = NILFS_ST_DAT; + nilfs_sc_cstage_set(sci, NILFS_ST_DAT); goto dat_stage; } - sci->sc_stage.scnt++; /* Fall through */ + nilfs_sc_cstage_inc(sci); /* Fall through */ case NILFS_ST_GC: if (nilfs_doing_gc()) { head = &sci->sc_gc_inodes; @@ -1103,7 +1164,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) } sci->sc_stage.gc_inode_ptr = NULL; } - sci->sc_stage.scnt++; /* Fall through */ + nilfs_sc_cstage_inc(sci); /* Fall through */ case NILFS_ST_FILE: head = &sci->sc_dirty_files; ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head, @@ -1125,10 +1186,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) } sci->sc_stage.dirty_file_ptr = NULL; if (mode == SC_FLUSH_FILE) { - sci->sc_stage.scnt = NILFS_ST_DONE; + 
nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; } - sci->sc_stage.scnt++; + nilfs_sc_cstage_inc(sci); sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; /* Fall through */ case NILFS_ST_IFILE: @@ -1136,7 +1197,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) &nilfs_sc_file_ops); if (unlikely(err)) break; - sci->sc_stage.scnt++; + nilfs_sc_cstage_inc(sci); /* Creating a checkpoint */ err = nilfs_segctor_create_checkpoint(sci); if (unlikely(err)) @@ -1147,7 +1208,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) &nilfs_sc_file_ops); if (unlikely(err)) break; - sci->sc_stage.scnt++; /* Fall through */ + nilfs_sc_cstage_inc(sci); /* Fall through */ case NILFS_ST_SUFILE: err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs, sci->sc_nfreesegs, &ndone); @@ -1163,7 +1224,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) &nilfs_sc_file_ops); if (unlikely(err)) break; - sci->sc_stage.scnt++; /* Fall through */ + nilfs_sc_cstage_inc(sci); /* Fall through */ case NILFS_ST_DAT: dat_stage: err = nilfs_segctor_scan_file(sci, nilfs->ns_dat, @@ -1171,10 +1232,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) if (unlikely(err)) break; if (mode == SC_FLUSH_DAT) { - sci->sc_stage.scnt = NILFS_ST_DONE; + nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; } - sci->sc_stage.scnt++; /* Fall through */ + nilfs_sc_cstage_inc(sci); /* Fall through */ case NILFS_ST_SR: if (mode == SC_LSEG_SR) { /* Appending a super root */ @@ -1184,7 +1245,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) } /* End of a logical segment */ sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; - sci->sc_stage.scnt = NILFS_ST_DONE; + nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; case NILFS_ST_DSYNC: dsync_mode: @@ -1197,7 +1258,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) if (unlikely(err)) break; 
sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; - sci->sc_stage.scnt = NILFS_ST_DONE; + nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; case NILFS_ST_DONE: return 0; @@ -1442,7 +1503,8 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci, goto failed; /* The current segment is filled up */ - if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE) + if (mode != SC_LSEG_SR || + nilfs_sc_cstage_get(sci) < NILFS_ST_CPFILE) break; nilfs_clear_logs(&sci->sc_segbufs); @@ -1946,7 +2008,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int err; - sci->sc_stage.scnt = NILFS_ST_INIT; + nilfs_sc_cstage_set(sci, NILFS_ST_INIT); sci->sc_cno = nilfs->ns_cno; err = nilfs_segctor_collect_dirty_files(sci, nilfs); @@ -1974,7 +2036,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) goto failed; /* Avoid empty segment */ - if (sci->sc_stage.scnt == NILFS_ST_DONE && + if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE && nilfs_segbuf_empty(sci->sc_curseg)) { nilfs_segctor_abort_construction(sci, nilfs, 1); goto out; @@ -1988,7 +2050,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) nilfs_segctor_fill_in_file_bmap(sci); if (mode == SC_LSEG_SR && - sci->sc_stage.scnt >= NILFS_ST_CPFILE) { + nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) { err = nilfs_segctor_fill_in_checkpoint(sci); if (unlikely(err)) goto failed_to_write; @@ -2007,7 +2069,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) if (unlikely(err)) goto failed_to_write; - if (sci->sc_stage.scnt == NILFS_ST_DONE || + if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE || nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) { /* * At this point, we avoid double buffering @@ -2020,7 +2082,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) if (err) goto failed_to_write; } - } while (sci->sc_stage.scnt != NILFS_ST_DONE); + } while 
(nilfs_sc_cstage_get(sci) != NILFS_ST_DONE); out: nilfs_segctor_drop_written_files(sci, nilfs); @@ -2430,7 +2492,6 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode) static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) { int mode = 0; - int err; spin_lock(&sci->sc_state_lock); mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ? @@ -2438,7 +2499,7 @@ static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) spin_unlock(&sci->sc_state_lock); if (mode) { - err = nilfs_segctor_do_construct(sci, mode); + nilfs_segctor_do_construct(sci, mode); spin_lock(&sci->sc_state_lock); sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ? diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index a48d6de1e02c..0408b9b2814b 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -67,7 +67,8 @@ struct nilfs_recovery_info { /** * struct nilfs_cstage - Context of collection stage - * @scnt: Stage count + * @scnt: Stage count, must be accessed via wrappers: + * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get() * @flags: State flags * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file * @gc_inode_ptr: Pointer on the list of gc-inodes diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 2a869c35c362..52821ffc11f4 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -30,6 +30,8 @@ #include "mdt.h" #include "sufile.h" +#include <trace/events/nilfs2.h> + /** * struct nilfs_sufile_info - on-memory private data of sufile * @mi: on-memory private data of metadata file @@ -317,7 +319,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) size_t susz = NILFS_MDT(sufile)->mi_entry_size; __u64 segnum, maxsegnum, last_alloc; void *kaddr; - unsigned long nsegments, ncleansegs, nsus, cnt; + unsigned long nsegments, nsus, cnt; int ret, j; down_write(&NILFS_MDT(sufile)->mi_sem); @@ -327,7 +329,6 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) goto out_sem; 
kaddr = kmap_atomic(header_bh->b_page); header = kaddr + bh_offset(header_bh); - ncleansegs = le64_to_cpu(header->sh_ncleansegs); last_alloc = le64_to_cpu(header->sh_last_alloc); kunmap_atomic(kaddr); @@ -358,6 +359,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) break; /* never happens */ } } + trace_nilfs2_segment_usage_check(sufile, segnum, cnt); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &su_bh); if (ret < 0) @@ -388,6 +390,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) nilfs_mdt_mark_dirty(sufile); brelse(su_bh); *segnump = segnum; + + trace_nilfs2_segment_usage_allocated(sufile, segnum); + goto out_header; } @@ -490,6 +495,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, NILFS_SUI(sufile)->ncleansegs++; nilfs_mdt_mark_dirty(sufile); + + trace_nilfs2_segment_usage_freed(sufile, segnum); } /** diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index f47585bfeb01..354013ea22ec 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -361,7 +361,7 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) struct nilfs_super_block *nsbp; sector_t blocknr, newblocknr; unsigned long offset; - int sb2i = -1; /* array index of the secondary superblock */ + int sb2i; /* array index of the secondary superblock */ int ret = 0; /* nilfs->ns_sem must be locked by the caller. 
*/ @@ -372,6 +372,9 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) } else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) { sb2i = 0; blocknr = nilfs->ns_sbh[0]->b_blocknr; + } else { + sb2i = -1; + blocknr = 0; } if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off) goto out; /* super block location is unchanged */ @@ -1405,14 +1408,10 @@ static void nilfs_destroy_cachep(void) */ rcu_barrier(); - if (nilfs_inode_cachep) - kmem_cache_destroy(nilfs_inode_cachep); - if (nilfs_transaction_cachep) - kmem_cache_destroy(nilfs_transaction_cachep); - if (nilfs_segbuf_cachep) - kmem_cache_destroy(nilfs_segbuf_cachep); - if (nilfs_btree_path_cache) - kmem_cache_destroy(nilfs_btree_path_cache); + kmem_cache_destroy(nilfs_inode_cachep); + kmem_cache_destroy(nilfs_transaction_cachep); + kmem_cache_destroy(nilfs_segbuf_cachep); + kmem_cache_destroy(nilfs_btree_path_cache); } static int __init nilfs_init_cachep(void) diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 262561fea923..9d383e5eff0e 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -525,8 +525,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, } } err = add_to_page_cache_lru(*cached_page, mapping, - index, - GFP_KERNEL & mapping_gfp_mask(mapping)); + index, + mapping_gfp_constraint(mapping, GFP_KERNEL)); if (unlikely(err)) { if (err == -EEXIST) continue; diff --git a/fs/proc/array.c b/fs/proc/array.c index eed2050db9be..d73291f5f0fc 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -91,18 +91,18 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) { char *buf; + size_t size; char tcomm[sizeof(p->comm)]; + int ret; get_task_comm(tcomm, p); seq_puts(m, "Name:\t"); - buf = m->buf + m->count; - /* Ignore error for now */ - buf += string_escape_str(tcomm, buf, m->size - m->count, - ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); + size = seq_get_buf(m, &buf); + ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | 
ESCAPE_SPECIAL, "\n\\"); + seq_commit(m, ret < size ? ret : -1); - m->count = buf - m->buf; seq_putc(m, '\n'); } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 6e5fcd00733e..3c2a915c695a 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -291,11 +291,19 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, */ int proc_fd_permission(struct inode *inode, int mask) { - int rv = generic_permission(inode, mask); + struct task_struct *p; + int rv; + + rv = generic_permission(inode, mask); if (rv == 0) - return 0; - if (task_tgid(current) == proc_pid(inode)) + return rv; + + rcu_read_lock(); + p = pid_task(proc_pid(inode), PIDTYPE_PID); + if (p && same_thread_group(p, current)) rv = 0; + rcu_read_unlock(); + return rv; } diff --git a/fs/seq_file.c b/fs/seq_file.c index 225586e141ca..e85664b7c7d9 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -13,6 +13,7 @@ #include <linux/cred.h> #include <linux/mm.h> #include <linux/printk.h> +#include <linux/string_helpers.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -25,12 +26,17 @@ static void seq_set_overflow(struct seq_file *m) static void *seq_buf_alloc(unsigned long size) { void *buf; + gfp_t gfp = GFP_KERNEL; /* - * __GFP_NORETRY to avoid oom-killings with high-order allocations - - * it's better to fall back to vmalloc() than to kill things. + * For high order allocations, use __GFP_NORETRY to avoid oom-killing - + * it's better to fall back to vmalloc() than to kill things. For small + * allocations, just use GFP_KERNEL which will oom kill, thus no need + * for vmalloc fallback. 
*/ - buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); + if (size > PAGE_SIZE) + gfp |= __GFP_NORETRY | __GFP_NOWARN; + buf = kmalloc(size, gfp); if (!buf && size > PAGE_SIZE) buf = vmalloc(size); return buf; @@ -377,26 +383,12 @@ EXPORT_SYMBOL(seq_release); */ void seq_escape(struct seq_file *m, const char *s, const char *esc) { - char *end = m->buf + m->size; - char *p; - char c; + char *buf; + size_t size = seq_get_buf(m, &buf); + int ret; - for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) { - if (!strchr(esc, c)) { - *p++ = c; - continue; - } - if (p + 3 < end) { - *p++ = '\\'; - *p++ = '0' + ((c & 0300) >> 6); - *p++ = '0' + ((c & 070) >> 3); - *p++ = '0' + (c & 07); - continue; - } - seq_set_overflow(m); - return; - } - m->count = p - m->buf; + ret = string_escape_str(s, buf, size, ESCAPE_OCTAL, esc); + seq_commit(m, ret < size ? ret : -1); } EXPORT_SYMBOL(seq_escape); @@ -773,6 +765,8 @@ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, { const u8 *ptr = buf; int i, linelen, remaining = len; + char *buffer; + size_t size; int ret; if (rowsize != 16 && rowsize != 32) @@ -794,15 +788,12 @@ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, break; } + size = seq_get_buf(m, &buffer); ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, - m->buf + m->count, m->size - m->count, - ascii); - if (ret >= m->size - m->count) { - seq_set_overflow(m); - } else { - m->count += ret; - seq_putc(m, '\n'); - } + buffer, size, ascii); + seq_commit(m, ret < size ? 
ret : -1); + + seq_putc(m, '\n'); } } EXPORT_SYMBOL(seq_hex_dump); diff --git a/fs/splice.c b/fs/splice.c index 5fc1e50a7f30..801c21cd77fe 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -360,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, break; error = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL & mapping_gfp_mask(mapping)); + mapping_gfp_constraint(mapping, GFP_KERNEL)); if (unlikely(error)) { page_cache_release(page); if (error == -EEXIST) diff --git a/fs/sync.c b/fs/sync.c index 4ec430ae2b0d..dd5d1711c7ac 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -348,7 +348,8 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, } if (flags & SYNC_FILE_RANGE_WRITE) { - ret = filemap_fdatawrite_range(mapping, offset, endbyte); + ret = __filemap_fdatawrite_range(mapping, offset, endbyte, + WB_SYNC_NONE); if (ret < 0) goto out_put; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index eac9549efd52..587174fd4f2c 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -525,7 +525,7 @@ xfs_qm_shrink_scan( unsigned long freed; int error; - if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) + if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM)) return 0; INIT_LIST_HEAD(&isol.buffers); diff --git a/include/drm/drmP.h b/include/drm/drmP.h index 8b5ce7c5d9bb..f56cdcecc1c9 100644 --- a/include/drm/drmP.h +++ b/include/drm/drmP.h @@ -822,7 +822,6 @@ struct drm_device { struct drm_sg_mem *sg; /**< Scatter gather memory */ unsigned int num_crtcs; /**< Number of CRTCs on this device */ - sigset_t sigmask; struct { int context; diff --git a/include/linux/bitops.h b/include/linux/bitops.h index e63553386ae7..2b8ed123ad36 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -164,6 +164,8 @@ static inline __u8 ror8(__u8 word, unsigned int shift) * sign_extend32 - sign extend a 32-bit value using specified bit as sign-bit * @value: value to sign extend * @index: 0 based bit 
index (0<=index<32) to sign bit + * + * This is safe to use for 16- and 8-bit types as well. */ static inline __s32 sign_extend32(__u32 value, int index) { @@ -171,6 +173,17 @@ static inline __s32 sign_extend32(__u32 value, int index) return (__s32)(value << shift) >> shift; } +/** + * sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit + * @value: value to sign extend + * @index: 0 based bit index (0<=index<64) to sign bit + */ +static inline __s64 sign_extend64(__u64 value, int index) +{ + __u8 shift = 63 - index; + return (__s64)(value << shift) >> shift; +} + static inline unsigned fls_long(unsigned long l) { if (sizeof(l) == 4) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 0e3110a0b771..22ab246feed3 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -205,7 +205,10 @@ #if GCC_VERSION >= 40600 /* - * Tell the optimizer that something else uses this function or variable. + * When used with Link Time Optimization, gcc can optimize away C functions or + * variables which are referenced only from assembly code. __visible tells the + * optimizer that something else uses this function or variable, thus preventing + * this. 
*/ #define __visible __attribute__((externally_visible)) #endif diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 5a1311942358..85a868ccb493 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -104,6 +104,9 @@ extern void cpuset_print_current_mems_allowed(void); */ static inline unsigned int read_mems_allowed_begin(void) { + if (!cpusets_enabled()) + return 0; + return read_seqcount_begin(&current->mems_allowed_seq); } @@ -115,6 +118,9 @@ static inline unsigned int read_mems_allowed_begin(void) */ static inline bool read_mems_allowed_retry(unsigned int seq) { + if (!cpusets_enabled()) + return false; + return read_seqcount_retry(&current->mems_allowed_seq, seq); } diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index ac07ff090919..2e551e2d2d03 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -1,6 +1,7 @@ #ifndef _LINUX_DMA_MAPPING_H #define _LINUX_DMA_MAPPING_H +#include <linux/sizes.h> #include <linux/string.h> #include <linux/device.h> #include <linux/err.h> @@ -145,7 +146,9 @@ static inline void arch_teardown_dma_ops(struct device *dev) { } static inline unsigned int dma_get_max_seg_size(struct device *dev) { - return dev->dma_parms ? dev->dma_parms->max_segment_size : 65536; + if (dev->dma_parms && dev->dma_parms->max_segment_size) + return dev->dma_parms->max_segment_size; + return SZ_64K; } static inline unsigned int dma_set_max_seg_size(struct device *dev, @@ -154,14 +157,15 @@ static inline unsigned int dma_set_max_seg_size(struct device *dev, if (dev->dma_parms) { dev->dma_parms->max_segment_size = size; return 0; - } else - return -EIO; + } + return -EIO; } static inline unsigned long dma_get_seg_boundary(struct device *dev) { - return dev->dma_parms ?
- dev->dma_parms->segment_boundary_mask : 0xffffffff; + if (dev->dma_parms && dev->dma_parms->segment_boundary_mask) + return dev->dma_parms->segment_boundary_mask; + return DMA_BIT_MASK(32); } static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) @@ -169,8 +173,8 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) if (dev->dma_parms) { dev->dma_parms->segment_boundary_mask = mask; return 0; - } else - return -EIO; + } + return -EIO; } #ifndef dma_max_pfn diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f92cbd2f4450..6523109e136d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -14,7 +14,7 @@ struct vm_area_struct; #define ___GFP_HIGHMEM 0x02u #define ___GFP_DMA32 0x04u #define ___GFP_MOVABLE 0x08u -#define ___GFP_WAIT 0x10u +#define ___GFP_RECLAIMABLE 0x10u #define ___GFP_HIGH 0x20u #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u @@ -29,18 +29,17 @@ struct vm_area_struct; #define ___GFP_NOMEMALLOC 0x10000u #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u -#define ___GFP_RECLAIMABLE 0x80000u +#define ___GFP_ATOMIC 0x80000u #define ___GFP_NOACCOUNT 0x100000u #define ___GFP_NOTRACK 0x200000u -#define ___GFP_NO_KSWAPD 0x400000u +#define ___GFP_DIRECT_RECLAIM 0x400000u #define ___GFP_OTHER_NODE 0x800000u #define ___GFP_WRITE 0x1000000u +#define ___GFP_KSWAPD_RECLAIM 0x2000000u /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* - * GFP bitmasks.. - * - * Zone modifiers (see linux/mmzone.h - low three bits) + * Physical address zone modifiers (see linux/mmzone.h - low four bits) * * Do not put any conditional on these. If necessary modify the definitions * without the underscores and use them consistently. 
The definitions here may @@ -50,116 +49,229 @@ struct vm_area_struct; #define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM) #define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32) #define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* Page is movable */ +#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */ #define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) + +/* + * Page mobility and placement hints + * + * These flags provide hints about how mobile the page is. Pages with similar + * mobility are placed within the same pageblocks to minimise problems due + * to external fragmentation. + * + * __GFP_MOVABLE (also a zone modifier) indicates that the page can be + * moved by page migration during memory compaction or can be reclaimed. + * + * __GFP_RECLAIMABLE is used for slab allocations that specify + * SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers. + * + * __GFP_WRITE indicates the caller intends to dirty the page. Where possible, + * these pages will be spread between local zones to avoid all the dirty + * pages being in one zone (fair zone allocation policy). + * + * __GFP_HARDWALL enforces the cpuset memory allocation policy. + * + * __GFP_THISNODE forces the allocation to be satisified from the requested + * node with no fallbacks or placement policy enforcements. + */ +#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) +#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) +#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) +#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE) + /* - * Action modifiers - doesn't change the zoning + * Watermark modifiers -- controls access to emergency reserves + * + * __GFP_HIGH indicates that the caller is high-priority and that granting + * the request is necessary before the system can make forward progress. + * For example, creating an IO context to clean pages. 
+ * + * __GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is + * high priority. Users are typically interrupt handlers. This may be + * used in conjunction with __GFP_HIGH + * + * __GFP_MEMALLOC allows access to all memory. This should only be used when + * the caller guarantees the allocation will allow more memory to be freed + * very shortly e.g. process exiting or swapping. Users either should + * be the MM or co-ordinating closely with the VM (e.g. swap over NFS). + * + * __GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. + * This takes precedence over the __GFP_MEMALLOC flag if both are set. + * + * __GFP_NOACCOUNT ignores the accounting for kmemcg limit enforcement. + */ +#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) +#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) +#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) +#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) +#define __GFP_NOACCOUNT ((__force gfp_t)___GFP_NOACCOUNT) + +/* + * Reclaim modifiers + * + * __GFP_IO can start physical IO. + * + * __GFP_FS can call down to the low-level FS. Clearing the flag avoids the + * allocator recursing into the filesystem which might already be holding + * locks. + * + * __GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim. + * This flag can be cleared to avoid unnecessary delays when a fallback + * option is available. + * + * __GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when + * the low watermark is reached and have it reclaim pages until the high + * watermark is reached. A caller may wish to clear this flag when fallback + * options are available and the reclaim is likely to disrupt the system. The + * canonical example is THP allocation where a fallback is cheap but + * reclaim/compaction may cause indirect stalls. + * + * __GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim. 
* * __GFP_REPEAT: Try hard to allocate the memory, but the allocation attempt - * _might_ fail. This depends upon the particular VM implementation. + * _might_ fail. This depends upon the particular VM implementation. * * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller - * cannot handle allocation failures. New users should be evaluated carefully - * (and the flag should be used only when there is no reasonable failure policy) - * but it is definitely preferable to use the flag rather than opencode endless - * loop around allocator. + * cannot handle allocation failures. New users should be evaluated carefully + * (and the flag should be used only when there is no reasonable failure + * policy) but it is definitely preferable to use the flag rather than + * opencode endless loop around allocator. * * __GFP_NORETRY: The VM implementation must not retry indefinitely and will - * return NULL when direct reclaim and memory compaction have failed to allow - * the allocation to succeed. The OOM killer is not called with the current - * implementation. - * - * __GFP_MOVABLE: Flag that this page will be movable by the page migration - * mechanism or reclaimed + * return NULL when direct reclaim and memory compaction have failed to allow + * the allocation to succeed. The OOM killer is not called with the current + * implementation. */ -#define __GFP_WAIT ((__force gfp_t)___GFP_WAIT) /* Can wait and reschedule? */ -#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) /* Should access emergency pools? */ -#define __GFP_IO ((__force gfp_t)___GFP_IO) /* Can start physical IO? */ -#define __GFP_FS ((__force gfp_t)___GFP_FS) /* Can call down to low-level FS? 
*/ -#define __GFP_COLD ((__force gfp_t)___GFP_COLD) /* Cache-cold page required */ -#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) /* Suppress page allocation failure warning */ -#define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT) /* See above */ -#define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) /* See above */ -#define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) /* See above */ -#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)/* Allow access to emergency reserves */ -#define __GFP_COMP ((__force gfp_t)___GFP_COMP) /* Add compound page metadata */ -#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) /* Return zeroed page on success */ -#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves. - * This takes precedence over the - * __GFP_MEMALLOC flag if both are - * set - */ -#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */ -#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */ -#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ -#define __GFP_NOACCOUNT ((__force gfp_t)___GFP_NOACCOUNT) /* Don't account to kmemcg */ -#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ - -#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) -#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ -#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ +#define __GFP_IO ((__force gfp_t)___GFP_IO) +#define __GFP_FS ((__force gfp_t)___GFP_FS) +#define __GFP_DIRECT_RECLAIM ((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */ +#define __GFP_KSWAPD_RECLAIM ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */ +#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM)) +#define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT) +#define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) 
+#define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) /* - * This may seem redundant, but it's a way of annotating false positives vs. - * allocations that simply cannot be supported (e.g. page tables). + * Action modifiers + * + * __GFP_COLD indicates that the caller does not expect to be used in the near + * future. Where possible, a cache-cold page will be returned. + * + * __GFP_NOWARN suppresses allocation failure reports. + * + * __GFP_COMP address compound page metadata. + * + * __GFP_ZERO returns a zeroed page on success. + * + * __GFP_NOTRACK avoids tracking with kmemcheck. + * + * __GFP_NOTRACK_FALSE_POSITIVE is an alias of __GFP_NOTRACK. It's a means of + * distinguishing in the source between false positives and allocations that + * cannot be supported (e.g. page tables). + * + * __GFP_OTHER_NODE is for allocations that are on a remote node but that + * should not be accounted for as a remote allocation in vmstat. A + * typical user would be khugepaged collapsing a huge page on a remote + * node. 
*/ +#define __GFP_COLD ((__force gfp_t)___GFP_COLD) +#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) +#define __GFP_COMP ((__force gfp_t)___GFP_COMP) +#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) +#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) +#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) -#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */ +/* Room for N __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 26 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) -/* This equals 0, but use constants in case they ever change */ -#define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH) -/* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */ -#define GFP_ATOMIC (__GFP_HIGH) -#define GFP_NOIO (__GFP_WAIT) -#define GFP_NOFS (__GFP_WAIT | __GFP_IO) -#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS) -#define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \ +/* + * Useful GFP flag combinations that are commonly used. It is recommended + * that subsystems start with one of these combinations and then set/clear + * __GFP_FOO flags as necessary. + * + * GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower + * watermark is applied to allow access to "atomic reserves" + * + * GFP_KERNEL is typical for kernel-internal allocations. The caller requires + * ZONE_NORMAL or a lower zone for direct access but can direct reclaim. + * + * GFP_NOWAIT is for kernel allocations that should not stall for direct + * reclaim, start physical IO or use any filesystem callback. + * + * GFP_NOIO will use direct reclaim to discard clean pages or slab pages + * that do not require the starting of any physical IO. + * + * GFP_NOFS will use direct reclaim but will not use any filesystem interfaces. + * + * GFP_USER is for userspace allocations that also need to be directly + * accessibly by the kernel or hardware. 
It is typically used by hardware + * for buffers that are mapped to userspace (e.g. graphics) that hardware + * still must DMA to. cpuset limits are enforced for these allocations. + * + * GFP_DMA exists for historical reasons and should be avoided where possible. + * The flags indicates that the caller requires that the lowest zone be + * used (ZONE_DMA or 16M on x86-64). Ideally, this would be removed but + * it would require careful auditing as some users really require it and + * others use the flag to avoid lowmem reserves in ZONE_DMA and treat the + * lowest zone as a type of emergency reserve. + * + * GFP_DMA32 is similar to GFP_DMA except that the caller requires a 32-bit + * address. + * + * GFP_HIGHUSER is for userspace allocations that may be mapped to userspace, + * do not need to be directly accessible by the kernel but that cannot + * move once in use. An example may be a hardware allocation that maps + * data directly into userspace but has no addressing limitations. + * + * GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not + * need direct access to but can use kmap() when access is required. They + * are expected to be movable via page reclaim or page migration. Typically, + * pages on the LRU would also be allocated with GFP_HIGHUSER_MOVABLE. + * + * GFP_TRANSHUGE is used for THP allocations. They are compound allocations + * that will fail quickly if memory is not available and will not wake + * kswapd on failure. 
+ */ +#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) +#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) +#define GFP_NOIO (__GFP_RECLAIM) +#define GFP_NOFS (__GFP_RECLAIM | __GFP_IO) +#define GFP_TEMPORARY (__GFP_RECLAIM | __GFP_IO | __GFP_FS | \ __GFP_RECLAIMABLE) -#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) +#define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL) +#define GFP_DMA __GFP_DMA +#define GFP_DMA32 __GFP_DMA32 #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) -#define GFP_IOFS (__GFP_IO | __GFP_FS) -#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ - __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ - __GFP_NO_KSWAPD) +#define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \ + ~__GFP_KSWAPD_RECLAIM) -/* This mask makes up all the page movable related flags */ +/* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) +#define GFP_MOVABLE_SHIFT 3 -/* Control page allocator reclaim behavior */ -#define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\ - __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ - __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC) - -/* Control slab gfp mask during early boot */ -#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS)) - -/* Control allocation constraints */ -#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) - -/* Do not use these with a slab allocator */ -#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) - -/* Flag - indicates that the buffer will be suitable for DMA. 
Ignored on some - platforms, used as appropriate on others */ - -#define GFP_DMA __GFP_DMA - -/* 4GB DMA on some platforms */ -#define GFP_DMA32 __GFP_DMA32 - -/* Convert GFP flags to their corresponding migrate type */ static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) { - WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); + VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); + BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); + BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE); if (unlikely(page_group_by_mobility_disabled)) return MIGRATE_UNMOVABLE; /* Group based on mobility */ - return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) | - ((gfp_flags & __GFP_RECLAIMABLE) != 0); + return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; +} +#undef GFP_MOVABLE_MASK +#undef GFP_MOVABLE_SHIFT + +static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) +{ + return gfp_flags & __GFP_DIRECT_RECLAIM; } #ifdef CONFIG_HIGHMEM diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 7edd30515298..24154c26d469 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -32,7 +32,7 @@ static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) return NULL; - return (struct hugetlb_cgroup *)page[2].lru.next; + return (struct hugetlb_cgroup *)page[2].private; } static inline @@ -42,7 +42,7 @@ int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) return -1; - page[2].lru.next = (void *)h_cg; + page[2].private = (unsigned long)h_cg; return 0; } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5582410727cb..2c13f747ac2e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -413,6 +413,8 @@ extern __printf(2, 3) char *kasprintf(gfp_t gfp, const char *fmt, ...); extern __printf(2, 0) char 
*kvasprintf(gfp_t gfp, const char *fmt, va_list args); +extern __printf(2, 0) +const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list args); extern __scanf(2, 3) int sscanf(const char *, const char *, ...); diff --git a/include/linux/mm.h b/include/linux/mm.h index 906c46a05707..00bad7793788 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -430,46 +430,6 @@ static inline void compound_unlock_irqrestore(struct page *page, #endif } -static inline struct page *compound_head_by_tail(struct page *tail) -{ - struct page *head = tail->first_page; - - /* - * page->first_page may be a dangling pointer to an old - * compound page, so recheck that it is still a tail - * page before returning. - */ - smp_rmb(); - if (likely(PageTail(tail))) - return head; - return tail; -} - -/* - * Since either compound page could be dismantled asynchronously in THP - * or we access asynchronously arbitrary positioned struct page, there - * would be tail flag race. To handle this race, we should call - * smp_rmb() before checking tail flag. compound_head_by_tail() did it. - */ -static inline struct page *compound_head(struct page *page) -{ - if (unlikely(PageTail(page))) - return compound_head_by_tail(page); - return page; -} - -/* - * If we access compound page synchronously such as access to - * allocated page, there is no need to handle tail flag race, so we can - * check tail flag directly without any synchronization primitive. 
- */ -static inline struct page *compound_head_fast(struct page *page) -{ - if (unlikely(PageTail(page))) - return page->first_page; - return page; -} - /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test @@ -518,7 +478,7 @@ static inline void get_huge_page_tail(struct page *page) VM_BUG_ON_PAGE(!PageTail(page), page); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); - if (compound_tail_refcounted(page->first_page)) + if (compound_tail_refcounted(compound_head(page))) atomic_inc(&page->_mapcount); } @@ -541,13 +501,7 @@ static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); - /* - * We don't need to worry about synchronization of tail flag - * when we call virt_to_head_page() since it is only called for - * already allocated page and this page won't be freed until - * this virt_to_head_page() is finished. So use _fast variant. - */ - return compound_head_fast(page); + return compound_head(page); } /* @@ -568,28 +522,42 @@ int split_free_page(struct page *page); /* * Compound pages have a destructor function. Provide a * prototype for that function and accessor functions. - * These are _only_ valid on the head of a PG_compound page. + * These are _only_ valid on the head of a compound page. 
*/ +typedef void compound_page_dtor(struct page *); + +/* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c */ +enum compound_dtor_id { + NULL_COMPOUND_DTOR, + COMPOUND_PAGE_DTOR, +#ifdef CONFIG_HUGETLB_PAGE + HUGETLB_PAGE_DTOR, +#endif + NR_COMPOUND_DTORS, +}; +extern compound_page_dtor * const compound_page_dtors[]; static inline void set_compound_page_dtor(struct page *page, - compound_page_dtor *dtor) + enum compound_dtor_id compound_dtor) { - page[1].compound_dtor = dtor; + VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page); + page[1].compound_dtor = compound_dtor; } static inline compound_page_dtor *get_compound_page_dtor(struct page *page) { - return page[1].compound_dtor; + VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page); + return compound_page_dtors[page[1].compound_dtor]; } -static inline int compound_order(struct page *page) +static inline unsigned int compound_order(struct page *page) { if (!PageHead(page)) return 0; return page[1].compound_order; } -static inline void set_compound_order(struct page *page, unsigned long order) +static inline void set_compound_order(struct page *page, unsigned int order) { page[1].compound_order = order; } @@ -1572,8 +1540,7 @@ static inline bool ptlock_init(struct page *page) * with 0. Make sure nobody took it in use in between. * * It can happen if arch try to use slab for page table allocation: - * slab code uses page->slab_cache and page->first_page (for tail - * pages), which share storage with page->ptl. + * slab code uses page->slab_cache, which share storage with page->ptl. 
*/ VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page); if (!ptlock_alloc(page)) @@ -1843,7 +1810,8 @@ extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); extern __printf(3, 4) -void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...); +void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, + const char *fmt, ...); extern void setup_per_cpu_pageset(void); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0a85da25a822..f8d1492a114f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -28,8 +28,6 @@ struct mem_cgroup; IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) -typedef void compound_page_dtor(struct page *); - /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -113,7 +111,13 @@ struct page { }; }; - /* Third double word block */ + /* + * Third double word block + * + * WARNING: bit 0 of the first word encode PageTail(). That means + * the rest users of the storage space MUST NOT use the bit to + * avoid collision and false-positive PageTail(). + */ union { struct list_head lru; /* Pageout list, eg. active_list * protected by zone->lru_lock ! @@ -131,18 +135,37 @@ struct page { #endif }; - struct slab *slab_page; /* slab fields */ struct rcu_head rcu_head; /* Used by SLAB * when destroying via RCU */ - /* First tail page of compound page */ + /* Tail pages of compound page */ struct { - compound_page_dtor *compound_dtor; - unsigned long compound_order; + unsigned long compound_head; /* If bit zero is set */ + + /* First tail page only */ +#ifdef CONFIG_64BIT + /* + * On 64 bit system we have enough space in struct page + * to encode compound_dtor and compound_order with + * unsigned int. It can help compiler generate better or + * smaller code on some archtectures. 
+ */ + unsigned int compound_dtor; + unsigned int compound_order; +#else + unsigned short int compound_dtor; + unsigned short int compound_order; +#endif }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS - pgtable_t pmd_huge_pte; /* protected by page->ptl */ + struct { + unsigned long __pad; /* do not overlay pmd_huge_pte + * with compound_head to avoid + * possible bit 0 collision. + */ + pgtable_t pmd_huge_pte; /* protected by page->ptl */ + }; #endif }; @@ -163,7 +186,6 @@ struct page { #endif #endif struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ - struct page *first_page; /* Compound tail pages */ }; #ifdef CONFIG_MEMCG diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2d7e660cdefe..e23a9e704536 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -37,10 +37,10 @@ enum { MIGRATE_UNMOVABLE, - MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, + MIGRATE_RECLAIMABLE, MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ - MIGRATE_RESERVE = MIGRATE_PCPTYPES, + MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, #ifdef CONFIG_CMA /* * MIGRATE_CMA migration type is designed to mimic the way @@ -334,13 +334,16 @@ struct zone { /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long watermark[NR_WMARK]; + unsigned long nr_reserved_highatomic; + /* - * We don't know if the memory that we're going to allocate will be freeable - * or/and it will be released eventually, so to avoid totally wasting several - * GB of ram we must reserve some of the lower zone memory (otherwise we risk - * to run OOM on the lower zones despite there's tons of freeable ram - * on the higher zones). This array is recalculated at runtime if the - * sysctl_lowmem_reserve_ratio sysctl changes. 
+ * We don't know if the memory that we're going to allocate will be + * freeable or/and it will be released eventually, so to avoid totally + * wasting several GB of ram we must reserve some of the lower zone + * memory (otherwise we risk to run OOM on the lower zones despite + * there being tons of freeable ram on the higher zones). This array is + * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl + * changes. */ long lowmem_reserve[MAX_NR_ZONES]; @@ -429,12 +432,6 @@ struct zone { const char *name; - /* - * Number of MIGRATE_RESERVE page block. To maintain for just - * optimization. Protected by zone->lock. - */ - int nr_migrate_reserve_block; - #ifdef CONFIG_MEMORY_ISOLATION /* * Number of isolated pageblock. It is used to solve incorrect @@ -589,75 +586,8 @@ static inline bool zone_is_empty(struct zone *zone) * [1] : No fallback (__GFP_THISNODE) */ #define MAX_ZONELISTS 2 - - -/* - * We cache key information from each zonelist for smaller cache - * footprint when scanning for free pages in get_page_from_freelist(). - * - * 1) The BITMAP fullzones tracks which zones in a zonelist have come - * up short of free memory since the last time (last_fullzone_zap) - * we zero'd fullzones. - * 2) The array z_to_n[] maps each zone in the zonelist to its node - * id, so that we can efficiently evaluate whether that node is - * set in the current tasks mems_allowed. - * - * Both fullzones and z_to_n[] are one-to-one with the zonelist, - * indexed by a zones offset in the zonelist zones[] array. - * - * The get_page_from_freelist() routine does two scans. During the - * first scan, we skip zones whose corresponding bit in 'fullzones' - * is set or whose corresponding node in current->mems_allowed (which - * comes from cpusets) is not set. During the second scan, we bypass - * this zonelist_cache, to ensure we look methodically at each zone. 
- * - * Once per second, we zero out (zap) fullzones, forcing us to - * reconsider nodes that might have regained more free memory. - * The field last_full_zap is the time we last zapped fullzones. - * - * This mechanism reduces the amount of time we waste repeatedly - * reexaming zones for free memory when they just came up low on - * memory momentarilly ago. - * - * The zonelist_cache struct members logically belong in struct - * zonelist. However, the mempolicy zonelists constructed for - * MPOL_BIND are intentionally variable length (and usually much - * shorter). A general purpose mechanism for handling structs with - * multiple variable length members is more mechanism than we want - * here. We resort to some special case hackery instead. - * - * The MPOL_BIND zonelists don't need this zonelist_cache (in good - * part because they are shorter), so we put the fixed length stuff - * at the front of the zonelist struct, ending in a variable length - * zones[], as is needed by MPOL_BIND. - * - * Then we put the optional zonelist cache on the end of the zonelist - * struct. This optional stuff is found by a 'zlcache_ptr' pointer in - * the fixed length portion at the front of the struct. This pointer - * both enables us to find the zonelist cache, and in the case of - * MPOL_BIND zonelists, (which will just set the zlcache_ptr to NULL) - * to know that the zonelist cache is not there. - * - * The end result is that struct zonelists come in two flavors: - * 1) The full, fixed length version, shown below, and - * 2) The custom zonelists for MPOL_BIND. - * The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache. - * - * Even though there may be multiple CPU cores on a node modifying - * fullzones or last_full_zap in the same zonelist_cache at the same - * time, we don't lock it. This is just hint data - if it is wrong now - * and then, the allocator will still function, perhaps a bit slower. 
- */ - - -struct zonelist_cache { - unsigned short z_to_n[MAX_ZONES_PER_ZONELIST]; /* zone->nid */ - DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST); /* zone full? */ - unsigned long last_full_zap; /* when last zap'd (jiffies) */ -}; #else #define MAX_ZONELISTS 1 -struct zonelist_cache; #endif /* @@ -675,9 +605,6 @@ struct zoneref { * allocation, the other zones are fallback zones, in decreasing * priority. * - * If zlcache_ptr is not NULL, then it is just the address of zlcache, - * as explained above. If zlcache_ptr is NULL, there is no zlcache. - * * * To speed the reading of the zonelist, the zonerefs contain the zone index * of the entry being read. Helper functions to access information given * a struct zoneref are @@ -687,11 +614,7 @@ struct zoneref { * zonelist_node_idx() - Return the index of the node for an entry */ struct zonelist { - struct zonelist_cache *zlcache_ptr; // NULL or &zlcache struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; -#ifdef CONFIG_NUMA - struct zonelist_cache zlcache; // optional ... 
-#endif }; #ifndef CONFIG_DISCONTIGMEM @@ -817,7 +740,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags); + unsigned long mark, int classzone_idx); enum memmap_context { MEMMAP_EARLY, MEMMAP_HOTPLUG, diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index c12f2147c350..52666d90ca94 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -386,6 +386,7 @@ extern int param_get_ullong(char *buffer, const struct kernel_param *kp); extern const struct kernel_param_ops param_ops_charp; extern int param_set_charp(const char *val, const struct kernel_param *kp); extern int param_get_charp(char *buffer, const struct kernel_param *kp); +extern void param_free_charp(void *arg); #define param_check_charp(name, p) __param_check(name, p, char *) /* We used to allow int as well as bool. We're taking that away! */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a525e5067484..bb53c7b86315 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -86,12 +86,7 @@ enum pageflags { PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ PG_writeback, /* Page is under writeback */ -#ifdef CONFIG_PAGEFLAGS_EXTENDED PG_head, /* A head page */ - PG_tail, /* A tail page */ -#else - PG_compound, /* A compound page */ -#endif PG_swapcache, /* Swap page: swp_entry_t in private */ PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ @@ -398,85 +393,46 @@ static inline void set_page_writeback_keepwrite(struct page *page) test_set_page_writeback_keepwrite(page); } -#ifdef CONFIG_PAGEFLAGS_EXTENDED -/* - * System with lots of page flags available. 
This allows separate - * flags for PageHead() and PageTail() checks of compound pages so that bit - * tests can be used in performance sensitive paths. PageCompound is - * generally not used in hot code paths except arch/powerpc/mm/init_64.c - * and arch/powerpc/kvm/book3s_64_vio_hv.c which use it to detect huge pages - * and avoid handling those in real mode. - */ __PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head) -__PAGEFLAG(Tail, tail) -static inline int PageCompound(struct page *page) -{ - return page->flags & ((1L << PG_head) | (1L << PG_tail)); - -} -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline void ClearPageCompound(struct page *page) +static inline int PageTail(struct page *page) { - BUG_ON(!PageHead(page)); - ClearPageHead(page); + return READ_ONCE(page->compound_head) & 1; } -#endif - -#define PG_head_mask ((1L << PG_head)) -#else -/* - * Reduce page flag use as much as possible by overlapping - * compound page flags with the flags used for page cache pages. Possible - * because PageCompound is always set for compound pages and not for - * pages on the LRU and/or pagecache. - */ -TESTPAGEFLAG(Compound, compound) -__SETPAGEFLAG(Head, compound) __CLEARPAGEFLAG(Head, compound) - -/* - * PG_reclaim is used in combination with PG_compound to mark the - * head and tail of a compound page. This saves one page flag - * but makes it impossible to use compound pages for the page cache. - * The PG_reclaim bit would have to be used for reclaim or readahead - * if compound pages enter the page cache. 
- * - * PG_compound & PG_reclaim => Tail page - * PG_compound & ~PG_reclaim => Head page - */ -#define PG_head_mask ((1L << PG_compound)) -#define PG_head_tail_mask ((1L << PG_compound) | (1L << PG_reclaim)) - -static inline int PageHead(struct page *page) +static inline void set_compound_head(struct page *page, struct page *head) { - return ((page->flags & PG_head_tail_mask) == PG_head_mask); + WRITE_ONCE(page->compound_head, (unsigned long)head + 1); } -static inline int PageTail(struct page *page) +static inline void clear_compound_head(struct page *page) { - return ((page->flags & PG_head_tail_mask) == PG_head_tail_mask); + WRITE_ONCE(page->compound_head, 0); } -static inline void __SetPageTail(struct page *page) +static inline struct page *compound_head(struct page *page) { - page->flags |= PG_head_tail_mask; + unsigned long head = READ_ONCE(page->compound_head); + + if (unlikely(head & 1)) + return (struct page *) (head - 1); + return page; } -static inline void __ClearPageTail(struct page *page) +static inline int PageCompound(struct page *page) { - page->flags &= ~PG_head_tail_mask; -} + return PageHead(page) || PageTail(page); +} #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void ClearPageCompound(struct page *page) { - BUG_ON((page->flags & PG_head_tail_mask) != (1 << PG_compound)); - clear_bit(PG_compound, &page->flags); + BUG_ON(!PageHead(page)); + ClearPageHead(page); } #endif -#endif /* !PAGEFLAGS_EXTENDED */ +#define PG_head_mask ((1L << PG_head)) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 2baeee12f48e..e942558b3585 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -44,7 +44,7 @@ enum pageblock_bits { #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE /* Huge page sizes are variable */ -extern int pageblock_order; +extern unsigned int pageblock_order; #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ diff --git 
a/include/linux/pagemap.h b/include/linux/pagemap.h index a6c78e00ea96..26eabf5ec718 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -69,6 +69,13 @@ static inline gfp_t mapping_gfp_mask(struct address_space * mapping) return (__force gfp_t)mapping->flags & __GFP_BITS_MASK; } +/* Restricts the given gfp_mask to what the mapping allows. */ +static inline gfp_t mapping_gfp_constraint(struct address_space *mapping, + gfp_t gfp_mask) +{ + return mapping_gfp_mask(mapping) & gfp_mask; +} + /* * This is non-atomic. Only to be used before the mapping is activated. * Probably needs a barrier... diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 830c4992088d..a5aa7ae671f4 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -101,13 +101,21 @@ static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent }) /** - * rbtree_postorder_for_each_entry_safe - iterate over rb_root in post order of - * given type safe against removal of rb_node entry + * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of + * given type allowing the backing memory of @pos to be invalidated * * @pos: the 'type *' to use as a loop cursor. * @n: another 'type *' to use as temporary storage * @root: 'rb_root *' of the rbtree. * @field: the name of the rb_node field within 'type'. + * + * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as + * list_for_each_entry_safe() and allows the iteration to continue independent + * of changes to @pos by the body of the loop. + * + * Note, however, that it cannot handle other modifications that re-order the + * rbtree it is iterating over. This includes calling rb_erase() on @pos, as + * rb_erase() may rebalance the tree, causing us to miss some nodes. 
*/ #define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ diff --git a/include/linux/sched.h b/include/linux/sched.h index eeb5066a44fb..4069febaa34a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1570,9 +1570,7 @@ struct task_struct { unsigned long sas_ss_sp; size_t sas_ss_size; - int (*notifier)(void *priv); - void *notifier_data; - sigset_t *notifier_mask; + struct callback_head *task_works; struct audit_context *audit_context; @@ -2464,21 +2462,29 @@ extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); -static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) +static inline int kernel_dequeue_signal(siginfo_t *info) { - unsigned long flags; + struct task_struct *tsk = current; + siginfo_t __info; int ret; - spin_lock_irqsave(&tsk->sighand->siglock, flags); - ret = dequeue_signal(tsk, mask, info); - spin_unlock_irqrestore(&tsk->sighand->siglock, flags); + spin_lock_irq(&tsk->sighand->siglock); + ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info); + spin_unlock_irq(&tsk->sighand->siglock); return ret; } -extern void block_all_signals(int (*notifier)(void *priv), void *priv, - sigset_t *mask); -extern void unblock_all_signals(void); +static inline void kernel_signal_stop(void) +{ + spin_lock_irq(&current->sighand->siglock); + if (current->jobctl & JOBCTL_STOP_DEQUEUED) + __set_current_state(TASK_STOPPED); + spin_unlock_irq(&current->sighand->siglock); + + schedule(); +} + extern void release_task(struct task_struct * p); extern int send_sig_info(int, struct siginfo *, struct task_struct *); extern int force_sigsegv(int, struct task_struct *); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 24f4dfd94c51..4355129fff91 100644 --- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h @@ -1224,7 +1224,7 @@ static inline int skb_cloned(const struct sk_buff *skb) static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) { - might_sleep_if(pri & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) return pskb_expand_head(skb, 0, 0, pri); @@ -1308,7 +1308,7 @@ static inline int skb_shared(const struct sk_buff *skb) */ static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri) { - might_sleep_if(pri & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, pri); @@ -1344,7 +1344,7 @@ static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri) static inline struct sk_buff *skb_unshare(struct sk_buff *skb, gfp_t pri) { - might_sleep_if(pri & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, pri); diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 42f8ec992452..2e97b7707dff 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -38,10 +38,10 @@ enum zpool_mapmode { bool zpool_has_pool(char *type); -struct zpool *zpool_create_pool(char *type, char *name, +struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, const struct zpool_ops *ops); -char *zpool_get_type(struct zpool *pool); +const char *zpool_get_type(struct zpool *pool); void zpool_destroy_pool(struct zpool *pool); @@ -83,7 +83,9 @@ struct zpool_driver { atomic_t refcount; struct list_head list; - void *(*create)(char *name, gfp_t gfp, const struct zpool_ops *ops, + void *(*create)(const char *name, + gfp_t gfp, + const struct zpool_ops *ops, struct zpool *zpool); void (*destroy)(void *pool); diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 6398dfae53f1..34eb16098a33 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -41,7 +41,7 @@ struct zs_pool_stats { struct zs_pool; 
-struct zs_pool *zs_create_pool(char *name, gfp_t flags); +struct zs_pool *zs_create_pool(const char *name, gfp_t flags); void zs_destroy_pool(struct zs_pool *pool); unsigned long zs_malloc(struct zs_pool *pool, size_t size); diff --git a/include/linux/zutil.h b/include/linux/zutil.h index 6adfa9a6ffe9..663689521759 100644 --- a/include/linux/zutil.h +++ b/include/linux/zutil.h @@ -68,10 +68,10 @@ typedef uLong (*check_func) (uLong check, const Byte *buf, An Adler-32 checksum is almost as reliable as a CRC32 but can be computed much faster. Usage example: - uLong adler = adler32(0L, NULL, 0); + uLong adler = zlib_adler32(0L, NULL, 0); while (read_buffer(buffer, length) != EOF) { - adler = adler32(adler, buffer, length); + adler = zlib_adler32(adler, buffer, length); } if (adler != original_adler) error(); */ diff --git a/include/net/sock.h b/include/net/sock.h index f570e75e3da9..bbf7c2cf15b4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2041,7 +2041,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, */ static inline struct page_frag *sk_page_frag(struct sock *sk) { - if (sk->sk_allocation & __GFP_WAIT) + if (gfpflags_allow_blocking(sk->sk_allocation)) return &current->task_frag; return &sk->sk_frag; diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index d6fd8e5b14b7..dde6bf092c8a 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -20,7 +20,7 @@ {(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \ {(unsigned long)GFP_NOIO, "GFP_NOIO"}, \ {(unsigned long)__GFP_HIGH, "GFP_HIGH"}, \ - {(unsigned long)__GFP_WAIT, "GFP_WAIT"}, \ + {(unsigned long)__GFP_ATOMIC, "GFP_ATOMIC"}, \ {(unsigned long)__GFP_IO, "GFP_IO"}, \ {(unsigned long)__GFP_COLD, "GFP_COLD"}, \ {(unsigned long)__GFP_NOWARN, "GFP_NOWARN"}, \ @@ -36,7 +36,8 @@ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ -
{(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ + {(unsigned long)__GFP_DIRECT_RECLAIM, "GFP_DIRECT_RECLAIM"}, \ + {(unsigned long)__GFP_KSWAPD_RECLAIM, "GFP_KSWAPD_RECLAIM"}, \ {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \ ) : "GFP_NOWAIT" diff --git a/include/trace/events/nilfs2.h b/include/trace/events/nilfs2.h new file mode 100644 index 000000000000..c7805818fcc6 --- /dev/null +++ b/include/trace/events/nilfs2.h @@ -0,0 +1,224 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nilfs2 + +#if !defined(_TRACE_NILFS2_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NILFS2_H + +#include <linux/tracepoint.h> + +struct nilfs_sc_info; + +#define show_collection_stage(type) \ + __print_symbolic(type, \ + { NILFS_ST_INIT, "ST_INIT" }, \ + { NILFS_ST_GC, "ST_GC" }, \ + { NILFS_ST_FILE, "ST_FILE" }, \ + { NILFS_ST_IFILE, "ST_IFILE" }, \ + { NILFS_ST_CPFILE, "ST_CPFILE" }, \ + { NILFS_ST_SUFILE, "ST_SUFILE" }, \ + { NILFS_ST_DAT, "ST_DAT" }, \ + { NILFS_ST_SR, "ST_SR" }, \ + { NILFS_ST_DSYNC, "ST_DSYNC" }, \ + { NILFS_ST_DONE, "ST_DONE"}) + +TRACE_EVENT(nilfs2_collection_stage_transition, + + TP_PROTO(struct nilfs_sc_info *sci), + + TP_ARGS(sci), + + TP_STRUCT__entry( + __field(void *, sci) + __field(int, stage) + ), + + TP_fast_assign( + __entry->sci = sci; + __entry->stage = sci->sc_stage.scnt; + ), + + TP_printk("sci = %p stage = %s", + __entry->sci, + show_collection_stage(__entry->stage)) +); + +#ifndef TRACE_HEADER_MULTI_READ +enum nilfs2_transaction_transition_state { + TRACE_NILFS2_TRANSACTION_BEGIN, + TRACE_NILFS2_TRANSACTION_COMMIT, + TRACE_NILFS2_TRANSACTION_ABORT, + TRACE_NILFS2_TRANSACTION_TRYLOCK, + TRACE_NILFS2_TRANSACTION_LOCK, + TRACE_NILFS2_TRANSACTION_UNLOCK, +}; +#endif + +#define show_transaction_state(type) \ + __print_symbolic(type, \ + { TRACE_NILFS2_TRANSACTION_BEGIN, "BEGIN" }, \ + { TRACE_NILFS2_TRANSACTION_COMMIT, "COMMIT" }, \ + { TRACE_NILFS2_TRANSACTION_ABORT, "ABORT" }, \ + { TRACE_NILFS2_TRANSACTION_TRYLOCK, "TRYLOCK" }, \ + { 
TRACE_NILFS2_TRANSACTION_LOCK, "LOCK" }, \ + { TRACE_NILFS2_TRANSACTION_UNLOCK, "UNLOCK" }) + +TRACE_EVENT(nilfs2_transaction_transition, + TP_PROTO(struct super_block *sb, + struct nilfs_transaction_info *ti, + int count, + unsigned int flags, + enum nilfs2_transaction_transition_state state), + + TP_ARGS(sb, ti, count, flags, state), + + TP_STRUCT__entry( + __field(void *, sb) + __field(void *, ti) + __field(int, count) + __field(unsigned int, flags) + __field(int, state) + ), + + TP_fast_assign( + __entry->sb = sb; + __entry->ti = ti; + __entry->count = count; + __entry->flags = flags; + __entry->state = state; + ), + + TP_printk("sb = %p ti = %p count = %d flags = %x state = %s", + __entry->sb, + __entry->ti, + __entry->count, + __entry->flags, + show_transaction_state(__entry->state)) +); + +TRACE_EVENT(nilfs2_segment_usage_check, + TP_PROTO(struct inode *sufile, + __u64 segnum, + unsigned long cnt), + + TP_ARGS(sufile, segnum, cnt), + + TP_STRUCT__entry( + __field(struct inode *, sufile) + __field(__u64, segnum) + __field(unsigned long, cnt) + ), + + TP_fast_assign( + __entry->sufile = sufile; + __entry->segnum = segnum; + __entry->cnt = cnt; + ), + + TP_printk("sufile = %p segnum = %llu cnt = %lu", + __entry->sufile, + __entry->segnum, + __entry->cnt) +); + +TRACE_EVENT(nilfs2_segment_usage_allocated, + TP_PROTO(struct inode *sufile, + __u64 segnum), + + TP_ARGS(sufile, segnum), + + TP_STRUCT__entry( + __field(struct inode *, sufile) + __field(__u64, segnum) + ), + + TP_fast_assign( + __entry->sufile = sufile; + __entry->segnum = segnum; + ), + + TP_printk("sufile = %p segnum = %llu", + __entry->sufile, + __entry->segnum) +); + +TRACE_EVENT(nilfs2_segment_usage_freed, + TP_PROTO(struct inode *sufile, + __u64 segnum), + + TP_ARGS(sufile, segnum), + + TP_STRUCT__entry( + __field(struct inode *, sufile) + __field(__u64, segnum) + ), + + TP_fast_assign( + __entry->sufile = sufile; + __entry->segnum = segnum; + ), + + TP_printk("sufile = %p segnum = %llu", + 
__entry->sufile, + __entry->segnum) +); + +TRACE_EVENT(nilfs2_mdt_insert_new_block, + TP_PROTO(struct inode *inode, + unsigned long ino, + unsigned long block), + + TP_ARGS(inode, ino, block), + + TP_STRUCT__entry( + __field(struct inode *, inode) + __field(unsigned long, ino) + __field(unsigned long, block) + ), + + TP_fast_assign( + __entry->inode = inode; + __entry->ino = ino; + __entry->block = block; + ), + + TP_printk("inode = %p ino = %lu block = %lu", + __entry->inode, + __entry->ino, + __entry->block) +); + +TRACE_EVENT(nilfs2_mdt_submit_block, + TP_PROTO(struct inode *inode, + unsigned long ino, + unsigned long blkoff, + int mode), + + TP_ARGS(inode, ino, blkoff, mode), + + TP_STRUCT__entry( + __field(struct inode *, inode) + __field(unsigned long, ino) + __field(unsigned long, blkoff) + __field(int, mode) + ), + + TP_fast_assign( + __entry->inode = inode; + __entry->ino = ino; + __entry->blkoff = blkoff; + __entry->mode = mode; + ), + + TP_printk("inode = %p ino = %lu blkoff = %lu mode = %x", + __entry->inode, + __entry->ino, + __entry->blkoff, + __entry->mode) +); + +#endif /* _TRACE_NILFS2_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE nilfs2 +#include <trace/define_trace.h> diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 71f448e5e927..ed81aafd2392 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -123,7 +123,6 @@ struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst) size_t len = src->m_ts; size_t alen; - WARN_ON(dst == NULL); if (src->m_ts > dst->m_ts) return ERR_PTR(-EINVAL); diff --git a/kernel/audit.c b/kernel/audit.c index 8a056a32ded7..5ffcbd354a52 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1371,16 +1371,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (unlikely(audit_filter_type(type))) return NULL; - if (gfp_mask & __GFP_WAIT) { + if (gfp_mask & __GFP_DIRECT_RECLAIM) { if (audit_pid && audit_pid == current->pid) - gfp_mask 
&= ~__GFP_WAIT; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; else reserve = 0; } while (audit_backlog_limit && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { - if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { + if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) { long sleep_time; sleep_time = timeout_start + audit_backlog_wait_time - jiffies; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b9d0cce3f9ce..f1603c153890 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -299,7 +299,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, idr_preload(gfp_mask); spin_lock_bh(&cgroup_idr_lock); - ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT); + ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; diff --git a/kernel/kexec.c b/kernel/kexec.c index 4c5edc357923..d873b64fbddc 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index bd9f8a03cefa..11b64a63c0f8 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -6,7 +6,7 @@ * Version 2. See the file COPYING for more details. 
*/ -#define pr_fmt(fmt) "kexec: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/mm.h> @@ -1027,7 +1027,7 @@ static int __init crash_notes_memory_init(void) crash_notes = __alloc_percpu(size, align); if (!crash_notes) { - pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); + pr_warn("Memory allocation for saving cpu register states failed\n"); return -ENOMEM; } return 0; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 6a9a3f2a0e8e..b70ada0028d2 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -9,6 +9,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4e49cc4c9952..deae3907ac1e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2738,7 +2738,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) return; /* no reclaim without waiting on it */ - if (!(gfp_mask & __GFP_WAIT)) + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return; /* this guy won't enter reclaim */ diff --git a/kernel/panic.c b/kernel/panic.c index 04e91ff7560b..4579dbb7ed87 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,6 +23,7 @@ #include <linux/sysrq.h> #include <linux/init.h> #include <linux/nmi.h> +#include <linux/console.h> #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -147,6 +148,15 @@ void panic(const char *fmt, ...) bust_spinlocks(0); + /* + * We may have ended up stopping the CPU holding the lock (in + * smp_send_stop()) while still having some valuable data in the console + * buffer. Try to acquire the lock then release it regardless of the + * result. The release will also print the buffers out. 
+ */ + console_trylock(); + console_unlock(); + if (!panic_blink) panic_blink = no_blink; diff --git a/kernel/params.c b/kernel/params.c index b6554aa71094..93a380a2345d 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -325,10 +325,11 @@ int param_get_charp(char *buffer, const struct kernel_param *kp) } EXPORT_SYMBOL(param_get_charp); -static void param_free_charp(void *arg) +void param_free_charp(void *arg) { maybe_kfree_parameter(*((char **)arg)); } +EXPORT_SYMBOL(param_free_charp); const struct kernel_param_ops param_ops_charp = { .set = param_set_charp, diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5235dd4e1e2f..3a970604308f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1779,7 +1779,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) while (to_alloc-- > 0) { struct page *page; - page = alloc_image_page(__GFP_HIGHMEM); + page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM); memory_bm_set_bit(bm, page_to_pfn(page)); } return nr_highmem; diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b2066fb5b10f..12cd989dadf6 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -257,7 +257,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); + bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); bio->bi_bdev = hib_resume_bdev; @@ -356,7 +356,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) return -ENOSPC; if (hb) { - src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | + src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); @@ -364,7 +364,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) ret = hib_wait_io(hb); /* Free pages */ if (ret) return ret; - src = (void *)__get_free_page(__GFP_WAIT | + src = (void 
*)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY); if (src) { @@ -672,7 +672,7 @@ static int save_image_lzo(struct swap_map_handle *handle, nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH); if (!page) { printk(KERN_ERR "PM: Failed to allocate LZO page\n"); ret = -ENOMEM; @@ -975,7 +975,7 @@ static int get_swap_reader(struct swap_map_handle *handle, last = tmp; tmp->map = (struct swap_map_page *) - __get_free_page(__GFP_WAIT | __GFP_HIGH); + __get_free_page(__GFP_RECLAIM | __GFP_HIGH); if (!tmp->map) { release_swap_reader(handle); return -ENOMEM; @@ -1242,9 +1242,9 @@ static int load_image_lzo(struct swap_map_handle *handle, for (i = 0; i < read_pages; i++) { page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? - __GFP_WAIT | __GFP_HIGH : - __GFP_WAIT | __GFP_NOWARN | - __GFP_NORETRY); + __GFP_RECLAIM | __GFP_HIGH : + __GFP_RECLAIM | __GFP_NOWARN | + __GFP_NORETRY); if (!page[i]) { if (i < LZO_CMP_PAGES) { diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b16f35487b67..2ce8826f1053 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -269,6 +269,9 @@ static u32 clear_idx; #define PREFIX_MAX 32 #define LOG_LINE_MAX (1024 - PREFIX_MAX) +#define LOG_LEVEL(v) ((v) & 0x07) +#define LOG_FACILITY(v) ((v) >> 3 & 0xff) + /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) #define LOG_ALIGN 4 @@ -612,7 +615,6 @@ struct devkmsg_user { static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; - int i; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ size_t len = iov_iter_count(from); @@ -642,12 +644,13 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) line = buf; if (line[0] == '<') { char *endp = NULL; + unsigned int u; - i = simple_strtoul(line+1, &endp, 10); + u = 
simple_strtoul(line + 1, &endp, 10); if (endp && endp[0] == '>') { - level = i & 7; - if (i >> 3) - facility = i >> 3; + level = LOG_LEVEL(u); + if (LOG_FACILITY(u) != 0) + facility = LOG_FACILITY(u); endp++; len -= endp - line; line = endp; diff --git a/kernel/signal.c b/kernel/signal.c index 0f6bbbe77b46..c0b01fe24bbd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -503,41 +503,6 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tsk->ptrace; } -/* - * Notify the system that a driver wants to block all signals for this - * process, and wants to be notified if any signals at all were to be - * sent/acted upon. If the notifier routine returns non-zero, then the - * signal will be acted upon after all. If the notifier routine returns 0, - * then then signal will be blocked. Only one block per process is - * allowed. priv is a pointer to private data that the notifier routine - * can use to determine if the signal should be blocked or not. - */ -void -block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) -{ - unsigned long flags; - - spin_lock_irqsave(&current->sighand->siglock, flags); - current->notifier_mask = mask; - current->notifier_data = priv; - current->notifier = notifier; - spin_unlock_irqrestore(&current->sighand->siglock, flags); -} - -/* Notify the system that blocking has ended.
*/ - -void -unblock_all_signals(void) -{ - unsigned long flags; - - spin_lock_irqsave(&current->sighand->siglock, flags); - current->notifier = NULL; - current->notifier_data = NULL; - recalc_sigpending(); - spin_unlock_irqrestore(&current->sighand->siglock, flags); -} - static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) { struct sigqueue *q, *first = NULL; @@ -580,19 +545,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, { int sig = next_signal(pending, mask); - if (sig) { - if (current->notifier) { - if (sigismember(current->notifier_mask, sig)) { - if (!(current->notifier)(current->notifier_data)) { - clear_thread_flag(TIF_SIGPENDING); - return 0; - } - } - } - + if (sig) collect_signal(sig, pending, info); - } - return sig; } @@ -834,7 +788,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) sigset_t flush; if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { - if (signal->flags & SIGNAL_GROUP_COREDUMP) + if (!(signal->flags & SIGNAL_GROUP_EXIT)) return sig == SIGKILL; /* * The process is in the middle of dying, nothing to do. @@ -2483,9 +2437,6 @@ EXPORT_SYMBOL(force_sig); EXPORT_SYMBOL(send_sig); EXPORT_SYMBOL(send_sig_info); EXPORT_SYMBOL(sigprocmask); -EXPORT_SYMBOL(block_all_signals); -EXPORT_SYMBOL(unblock_all_signals); - /* * System call entry points.
diff --git a/kernel/smp.c b/kernel/smp.c index 07854477c164..d903c02223af 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -669,7 +669,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), cpumask_var_t cpus; int cpu, ret; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_flags)); if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { preempt_disable(); diff --git a/kernel/sys.c b/kernel/sys.c index fa2f2f671a5c..6af9212ab5aa 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -222,7 +222,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) goto out_unlock; /* No processes for this user */ } do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) + if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) error = set_one_prio(p, niceval, error); } while_each_thread(g, p); if (!uid_eq(uid, cred->uid)) @@ -290,7 +290,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) goto out_unlock; /* No processes for this user */ } do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) { + if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1d1521c26302..16bf3bc25e3e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1686,6 +1686,9 @@ config TEST_STRING_HELPERS config TEST_KSTRTOX tristate "Test kstrto*() family of functions at runtime" +config TEST_PRINTF + tristate "Test printf() family of functions at runtime" + config TEST_RHASHTABLE tristate "Perform selftest on resizable hash table" default n diff --git a/lib/Makefile b/lib/Makefile index 8de3b012eac7..7f1de26613d2 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o obj-$(CONFIG_TEST_USER_COPY) += test_user_copy.o obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_keys.o obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o +obj-$(CONFIG_TEST_PRINTF) += 
test_printf.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG diff --git a/lib/dma-debug.c b/lib/dma-debug.c index fcb65d2a0b94..8855f019ebe8 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -1249,6 +1249,14 @@ static void check_sync(struct device *dev, dir2name[entry->direction], dir2name[ref->direction]); + if (ref->sg_call_ents && ref->type == dma_debug_sg && + ref->sg_call_ents != entry->sg_call_ents) { + err_printk(ref->dev, entry, "DMA-API: device driver syncs " + "DMA sg list with different entry count " + "[map count=%d] [sync count=%d]\n", + entry->sg_call_ents, ref->sg_call_ents); + } + out: put_hash_bucket(bucket, &flags); } diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index e491e02eff54..e3952e9c8ec0 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -42,7 +42,7 @@ extern struct _ddebug __stop___verbose[]; struct ddebug_table { struct list_head link; - char *mod_name; + const char *mod_name; unsigned int num_ddebugs; struct _ddebug *ddebugs; }; @@ -841,12 +841,12 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n, const char *name) { struct ddebug_table *dt; - char *new_name; + const char *new_name; dt = kzalloc(sizeof(*dt), GFP_KERNEL); if (dt == NULL) return -ENOMEM; - new_name = kstrdup(name, GFP_KERNEL); + new_name = kstrdup_const(name, GFP_KERNEL); if (new_name == NULL) { kfree(dt); return -ENOMEM; @@ -907,7 +907,7 @@ int ddebug_dyndbg_module_param_cb(char *param, char *val, const char *module) static void ddebug_table_free(struct ddebug_table *dt) { list_del_init(&dt->link); - kfree(dt->mod_name); + kfree_const(dt->mod_name); kfree(dt); } diff --git a/lib/halfmd4.c b/lib/halfmd4.c index a8fe6274a13c..137e861d9690 100644 --- a/lib/halfmd4.c +++ b/lib/halfmd4.c @@ -1,6 +1,7 @@ #include <linux/compiler.h> #include <linux/export.h> #include <linux/cryptohash.h> +#include <linux/bitops.h> /* F, G and H are basic MD4 functions: selection, majority, parity */ #define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) 
@@ -14,7 +15,7 @@ * Rotation is separate from addition to prevent recomputation */ #define ROUND(f, a, b, c, d, x, s) \ - (a += f(b, c, d) + x, a = (a << s) | (a >> (32 - s))) + (a += f(b, c, d) + x, a = rol32(a, s)) #define K1 0 #define K2 013240474631UL #define K3 015666365641UL diff --git a/lib/hexdump.c b/lib/hexdump.c index 8d74c20d8595..992457b1284c 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -169,11 +169,15 @@ int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, } } else { for (j = 0; j < len; j++) { - if (linebuflen < lx + 3) + if (linebuflen < lx + 2) goto overflow2; ch = ptr[j]; linebuf[lx++] = hex_asc_hi(ch); + if (linebuflen < lx + 2) + goto overflow2; linebuf[lx++] = hex_asc_lo(ch); + if (linebuflen < lx + 2) + goto overflow2; linebuf[lx++] = ' '; } if (j) diff --git a/lib/idr.c b/lib/idr.c index 5335c43adf46..6098336df267 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -399,7 +399,7 @@ void idr_preload(gfp_t gfp_mask) * allocation guarantee. Disallow usage from those contexts. */ WARN_ON_ONCE(in_interrupt()); - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); preempt_disable(); @@ -453,7 +453,7 @@ int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) struct idr_layer *pa[MAX_IDR_LEVEL + 1]; int id; - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); /* sanity checks */ if (WARN_ON_ONCE(start < 0)) diff --git a/lib/is_single_threaded.c b/lib/is_single_threaded.c index bd2bea963364..391fd23976a2 100644 --- a/lib/is_single_threaded.c +++ b/lib/is_single_threaded.c @@ -36,8 +36,7 @@ bool current_is_single_threaded(void) if (unlikely(p == task->group_leader)) continue; - t = p; - do { + for_each_thread(p, t) { if (unlikely(t->mm == mm)) goto found; if (likely(t->mm)) @@ -48,7 +47,7 @@ bool current_is_single_threaded(void) * forked before exiting. 
*/ smp_rmb(); - } while_each_thread(p, t); + } } ret = true; found: diff --git a/lib/kasprintf.c b/lib/kasprintf.c index 32f12150fc4f..f194e6e593e1 100644 --- a/lib/kasprintf.c +++ b/lib/kasprintf.c @@ -31,6 +31,22 @@ char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap) } EXPORT_SYMBOL(kvasprintf); +/* + * If fmt contains no % (or is exactly %s), use kstrdup_const. If fmt + * (or the sole vararg) points to rodata, we will then save a memory + * allocation and string copy. In any case, the return value should be + * freed using kfree_const(). + */ +const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list ap) +{ + if (!strchr(fmt, '%')) + return kstrdup_const(fmt, gfp); + if (!strcmp(fmt, "%s")) + return kstrdup_const(va_arg(ap, const char*), gfp); + return kvasprintf(gfp, fmt, ap); +} +EXPORT_SYMBOL(kvasprintf_const); + char *kasprintf(gfp_t gfp, const char *fmt, ...) { va_list ap; diff --git a/lib/kobject.c b/lib/kobject.c index 055407746266..7cbccd2b4c72 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -257,18 +257,32 @@ static int kobject_add_internal(struct kobject *kobj) int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list vargs) { - char *s; + const char *s; if (kobj->name && !fmt) return 0; - s = kvasprintf(GFP_KERNEL, fmt, vargs); + s = kvasprintf_const(GFP_KERNEL, fmt, vargs); if (!s) return -ENOMEM; - /* ewww... some of these buggers have '/' in the name ... */ - strreplace(s, '/', '!'); - kfree(kobj->name); + /* + * ewww... some of these buggers have '/' in the name ... If + * that's the case, we need to make sure we have an actual + * allocated copy to modify, since kvasprintf_const may have + * returned something from .rodata. 
+ */ + if (strchr(s, '/')) { + char *t; + + t = kstrdup(s, GFP_KERNEL); + kfree_const(s); + if (!t) + return -ENOMEM; + strreplace(t, '/', '!'); + s = t; + } + kfree_const(kobj->name); kobj->name = s; return 0; @@ -466,7 +480,7 @@ int kobject_rename(struct kobject *kobj, const char *new_name) envp[0] = devpath_string; envp[1] = NULL; - name = dup_name = kstrdup(new_name, GFP_KERNEL); + name = dup_name = kstrdup_const(new_name, GFP_KERNEL); if (!name) { error = -ENOMEM; goto out; @@ -486,7 +500,7 @@ int kobject_rename(struct kobject *kobj, const char *new_name) kobject_uevent_env(kobj, KOBJ_MOVE, envp); out: - kfree(dup_name); + kfree_const(dup_name); kfree(devpath_string); kfree(devpath); kobject_put(kobj); @@ -634,7 +648,7 @@ static void kobject_cleanup(struct kobject *kobj) /* free name if we allocated it */ if (name) { pr_debug("kobject: '%s': free name\n", name); - kfree(name); + kfree_const(name); } } diff --git a/lib/llist.c b/lib/llist.c index 0b0e9779d675..ae5872b1df0c 100644 --- a/lib/llist.c +++ b/lib/llist.c @@ -66,12 +66,12 @@ struct llist_node *llist_del_first(struct llist_head *head) { struct llist_node *entry, *old_entry, *next; - entry = head->first; + entry = smp_load_acquire(&head->first); for (;;) { if (entry == NULL) return NULL; old_entry = entry; - next = entry->next; + next = READ_ONCE(entry->next); entry = cmpxchg(&head->first, old_entry, next); if (entry == old_entry) break; diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c index f75715131f20..6d40944960de 100644 --- a/lib/percpu_ida.c +++ b/lib/percpu_ida.c @@ -135,7 +135,7 @@ static inline unsigned alloc_local_tag(struct percpu_ida_cpu *tags) * TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, of course). 
* * @gfp indicates whether or not to wait until a free id is available (it's not - * used for internal memory allocations); thus if passed __GFP_WAIT we may sleep + * used for internal memory allocations); thus if passed __GFP_RECLAIM we may sleep * however long it takes until another thread frees an id (same semantics as a * mempool). * diff --git a/lib/radix-tree.c b/lib/radix-tree.c index f9ebe1c82060..fcf5d98574ce 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -188,7 +188,7 @@ radix_tree_node_alloc(struct radix_tree_root *root) * preloading in the interrupt anyway as all the allocations have to * be atomic. So just do normal allocation when in interrupt. */ - if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) { + if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) { struct radix_tree_preload *rtp; /* @@ -249,7 +249,7 @@ radix_tree_node_free(struct radix_tree_node *node) * with preemption not disabled. * * To make use of this facility, the radix tree must be initialised without - * __GFP_WAIT being passed to INIT_RADIX_TREE(). + * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ static int __radix_tree_preload(gfp_t gfp_mask) { @@ -286,12 +286,12 @@ out: * with preemption not disabled. * * To make use of this facility, the radix tree must be initialised without - * __GFP_WAIT being passed to INIT_RADIX_TREE(). + * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ int radix_tree_preload(gfp_t gfp_mask) { /* Warn on non-sensical use... 
*/ - WARN_ON_ONCE(!(gfp_mask & __GFP_WAIT)); + WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); return __radix_tree_preload(gfp_mask); } EXPORT_SYMBOL(radix_tree_preload); @@ -303,7 +303,7 @@ EXPORT_SYMBOL(radix_tree_preload); */ int radix_tree_maybe_preload(gfp_t gfp_mask) { - if (gfp_mask & __GFP_WAIT) + if (gfpflags_allow_blocking(gfp_mask)) return __radix_tree_preload(gfp_mask); /* Preloading doesn't help anything with this gfp mask, skip it */ preempt_disable(); diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c index 8e376efd88a4..98866a770770 100644 --- a/lib/test-string_helpers.c +++ b/lib/test-string_helpers.c @@ -326,6 +326,39 @@ out: kfree(out_test); } +#define string_get_size_maxbuf 16 +#define test_string_get_size_one(size, blk_size, units, exp_result) \ + do { \ + BUILD_BUG_ON(sizeof(exp_result) >= string_get_size_maxbuf); \ + __test_string_get_size((size), (blk_size), (units), \ + (exp_result)); \ + } while (0) + + +static __init void __test_string_get_size(const u64 size, const u64 blk_size, + const enum string_size_units units, + const char *exp_result) +{ + char buf[string_get_size_maxbuf]; + + string_get_size(size, blk_size, units, buf, sizeof(buf)); + if (!memcmp(buf, exp_result, strlen(exp_result) + 1)) + return; + + buf[sizeof(buf) - 1] = '\0'; + pr_warn("Test 'test_string_get_size_one' failed!\n"); + pr_warn("string_get_size(size = %llu, blk_size = %llu, units = %d\n", + size, blk_size, units); + pr_warn("expected: '%s', got '%s'\n", exp_result, buf); +} + +static __init void test_string_get_size(void) +{ + test_string_get_size_one(16384, 512, STRING_UNITS_2, "8.00 MiB"); + test_string_get_size_one(8192, 4096, STRING_UNITS_10, "32.7 MB"); + test_string_get_size_one(1, 512, STRING_UNITS_10, "512 B"); +} + static int __init test_string_helpers_init(void) { unsigned int i; @@ -344,6 +377,9 @@ static int __init test_string_helpers_init(void) for (i = 0; i < (ESCAPE_ANY_NP | ESCAPE_HEX) + 1; i++) test_string_escape("escape 1", 
escape1, i, TEST_STRING_2_DICT_1); + /* Test string_get_size() */ + test_string_get_size(); + return -EINVAL; } module_init(test_string_helpers_init); diff --git a/lib/test_printf.c b/lib/test_printf.c new file mode 100644 index 000000000000..c5a666af9ba5 --- /dev/null +++ b/lib/test_printf.c @@ -0,0 +1,362 @@ +/* + * Test cases for printf facility. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/string.h> + +#include <linux/socket.h> +#include <linux/in.h> + +#define BUF_SIZE 256 +#define FILL_CHAR '$' + +#define PTR1 ((void*)0x01234567) +#define PTR2 ((void*)(long)(int)0xfedcba98) + +#if BITS_PER_LONG == 64 +#define PTR1_ZEROES "000000000" +#define PTR1_SPACES " " +#define PTR1_STR "1234567" +#define PTR2_STR "fffffffffedcba98" +#define PTR_WIDTH 16 +#else +#define PTR1_ZEROES "0" +#define PTR1_SPACES " " +#define PTR1_STR "1234567" +#define PTR2_STR "fedcba98" +#define PTR_WIDTH 8 +#endif +#define PTR_WIDTH_STR stringify(PTR_WIDTH) + +static unsigned total_tests __initdata; +static unsigned failed_tests __initdata; +static char *test_buffer __initdata; + +static int __printf(4, 0) __init +do_test(int bufsize, const char *expect, int elen, + const char *fmt, va_list ap) +{ + va_list aq; + int ret, written; + + total_tests++; + + memset(test_buffer, FILL_CHAR, BUF_SIZE); + va_copy(aq, ap); + ret = vsnprintf(test_buffer, bufsize, fmt, aq); + va_end(aq); + + if (ret != elen) { + pr_warn("vsnprintf(buf, %d, \"%s\", ...) returned %d, expected %d\n", + bufsize, fmt, ret, elen); + return 1; + } + + if (!bufsize) { + if (memchr_inv(test_buffer, FILL_CHAR, BUF_SIZE)) { + pr_warn("vsnprintf(buf, 0, \"%s\", ...) wrote to buffer\n", + fmt); + return 1; + } + return 0; + } + + written = min(bufsize-1, elen); + if (test_buffer[written]) { + pr_warn("vsnprintf(buf, %d, \"%s\", ...) 
did not nul-terminate buffer\n", + bufsize, fmt); + return 1; + } + + if (memcmp(test_buffer, expect, written)) { + pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote '%s', expected '%.*s'\n", + bufsize, fmt, test_buffer, written, expect); + return 1; + } + return 0; +} + +static void __printf(3, 4) __init +__test(const char *expect, int elen, const char *fmt, ...) +{ + va_list ap; + int rand; + char *p; + + BUG_ON(elen >= BUF_SIZE); + + va_start(ap, fmt); + + /* + * Every fmt+args is subjected to four tests: Three where we + * tell vsnprintf varying buffer sizes (plenty, not quite + * enough and 0), and then we also test that kvasprintf would + * be able to print it as expected. + */ + failed_tests += do_test(BUF_SIZE, expect, elen, fmt, ap); + rand = 1 + prandom_u32_max(elen+1); + /* Since elen < BUF_SIZE, we have 1 <= rand <= BUF_SIZE. */ + failed_tests += do_test(rand, expect, elen, fmt, ap); + failed_tests += do_test(0, expect, elen, fmt, ap); + + p = kvasprintf(GFP_KERNEL, fmt, ap); + if (p) { + if (memcmp(p, expect, elen+1)) { + pr_warn("kvasprintf(..., \"%s\", ...) returned '%s', expected '%s'\n", + fmt, p, expect); + failed_tests++; + } + kfree(p); + } + va_end(ap); +} + +#define test(expect, fmt, ...) \ + __test(expect, strlen(expect), fmt, ##__VA_ARGS__) + +static void __init +test_basic(void) +{ + /* Work around annoying "warning: zero-length gnu_printf format string". 
*/ + char nul = '\0'; + + test("", &nul); + test("100%", "100%%"); + test("xxx%yyy", "xxx%cyyy", '%'); + __test("xxx\0yyy", 7, "xxx%cyyy", '\0'); +} + +static void __init +test_number(void) +{ + test("0x1234abcd ", "%#-12x", 0x1234abcd); + test(" 0x1234abcd", "%#12x", 0x1234abcd); + test("0|001| 12|+123| 1234|-123|-1234", "%d|%03d|%3d|%+d|% d|%+d|% d", 0, 1, 12, 123, 1234, -123, -1234); +} + +static void __init +test_string(void) +{ + test("", "%s%.0s", "", "123"); + test("ABCD|abc|123", "%s|%.3s|%.*s", "ABCD", "abcdef", 3, "123456"); + test("1 | 2|3 | 4|5 ", "%-3s|%3s|%-*s|%*s|%*s", "1", "2", 3, "3", 3, "4", -3, "5"); + /* + * POSIX and C99 say that a missing precision should be + * treated as a precision of 0. However, the kernel's printf + * implementation treats this case as if the . wasn't + * present. Let's add a test case documenting the current + * behaviour; should anyone ever feel the need to follow the + * standards more closely, this can be revisited. + */ + test("a||", "%.s|%.0s|%.*s", "a", "b", 0, "c"); + test("a | | ", "%-3.s|%-3.0s|%-3.*s", "a", "b", 0, "c"); +} + +static void __init +plain(void) +{ + test(PTR1_ZEROES PTR1_STR " " PTR2_STR, "%p %p", PTR1, PTR2); + /* + * The field width is overloaded for some %p extensions to + * pass another piece of information. For plain pointers, the + * behaviour is slightly odd: One cannot pass either the 0 + * flag nor a precision to %p without gcc complaining, and if + * one explicitly gives a field width, the number is no longer + * zero-padded. + */ + test("|" PTR1_STR PTR1_SPACES " | " PTR1_SPACES PTR1_STR "|", + "|%-*p|%*p|", PTR_WIDTH+2, PTR1, PTR_WIDTH+2, PTR1); + test("|" PTR2_STR " | " PTR2_STR "|", + "|%-*p|%*p|", PTR_WIDTH+2, PTR2, PTR_WIDTH+2, PTR2); + + /* + * Unrecognized %p extensions are treated as plain %p, but the + * alphanumeric suffix is ignored (that is, does not occur in + * the output.) 
+ */ + test("|"PTR1_ZEROES PTR1_STR"|", "|%p0y|", PTR1); + test("|"PTR2_STR"|", "|%p0y|", PTR2); +} + +static void __init +symbol_ptr(void) +{ +} + +static void __init +kernel_ptr(void) +{ +} + +static void __init +struct_resource(void) +{ +} + +static void __init +addr(void) +{ +} + +static void __init +escaped_str(void) +{ +} + +static void __init +hex_string(void) +{ + const char buf[3] = {0xc0, 0xff, 0xee}; + + test("c0 ff ee|c0:ff:ee|c0-ff-ee|c0ffee", + "%3ph|%3phC|%3phD|%3phN", buf, buf, buf, buf); + test("c0 ff ee|c0:ff:ee|c0-ff-ee|c0ffee", + "%*ph|%*phC|%*phD|%*phN", 3, buf, 3, buf, 3, buf, 3, buf); +} + +static void __init +mac(void) +{ + const u8 addr[6] = {0x2d, 0x48, 0xd6, 0xfc, 0x7a, 0x05}; + + test("2d:48:d6:fc:7a:05", "%pM", addr); + test("05:7a:fc:d6:48:2d", "%pMR", addr); + test("2d-48-d6-fc-7a-05", "%pMF", addr); + test("2d48d6fc7a05", "%pm", addr); + test("057afcd6482d", "%pmR", addr); +} + +static void __init +ip4(void) +{ + struct sockaddr_in sa; + + sa.sin_family = AF_INET; + sa.sin_port = cpu_to_be16(12345); + sa.sin_addr.s_addr = cpu_to_be32(0x7f000001); + + test("127.000.000.001|127.0.0.1", "%pi4|%pI4", &sa.sin_addr, &sa.sin_addr); + test("127.000.000.001|127.0.0.1", "%piS|%pIS", &sa, &sa); + sa.sin_addr.s_addr = cpu_to_be32(0x01020304); + test("001.002.003.004:12345|1.2.3.4:12345", "%piSp|%pISp", &sa, &sa); +} + +static void __init +ip6(void) +{ +} + +static void __init +ip(void) +{ + ip4(); + ip6(); +} + +static void __init +uuid(void) +{ + const char uuid[16] = {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf}; + + test("00010203-0405-0607-0809-0a0b0c0d0e0f", "%pUb", uuid); + test("00010203-0405-0607-0809-0A0B0C0D0E0F", "%pUB", uuid); + test("03020100-0504-0706-0809-0a0b0c0d0e0f", "%pUl", uuid); + test("03020100-0504-0706-0809-0A0B0C0D0E0F", "%pUL", uuid); +} + +static void __init +dentry(void) +{ +} + +static void __init +struct_va_format(void) +{ +} + +static void __init +struct_clk(void) +{ +} + 
+static void __init +bitmap(void) +{ + DECLARE_BITMAP(bits, 20); + const int primes[] = {2,3,5,7,11,13,17,19}; + int i; + + bitmap_zero(bits, 20); + test("00000|00000", "%20pb|%*pb", bits, 20, bits); + test("|", "%20pbl|%*pbl", bits, 20, bits); + + for (i = 0; i < ARRAY_SIZE(primes); ++i) + set_bit(primes[i], bits); + test("a28ac|a28ac", "%20pb|%*pb", bits, 20, bits); + test("2-3,5,7,11,13,17,19|2-3,5,7,11,13,17,19", "%20pbl|%*pbl", bits, 20, bits); + + bitmap_fill(bits, 20); + test("fffff|fffff", "%20pb|%*pb", bits, 20, bits); + test("0-19|0-19", "%20pbl|%*pbl", bits, 20, bits); +} + +static void __init +netdev_features(void) +{ +} + +static void __init +test_pointer(void) +{ + plain(); + symbol_ptr(); + kernel_ptr(); + struct_resource(); + addr(); + escaped_str(); + hex_string(); + mac(); + ip(); + uuid(); + dentry(); + struct_va_format(); + struct_clk(); + bitmap(); + netdev_features(); +} + +static int __init +test_printf_init(void) +{ + test_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); + if (!test_buffer) + return -ENOMEM; + + test_basic(); + test_number(); + test_string(); + test_pointer(); + + kfree(test_buffer); + + if (failed_tests == 0) + pr_info("all %u tests passed\n", total_tests); + else + pr_warn("failed %u out of %u tests\n", failed_tests, total_tests); + + return failed_tests ? -EINVAL : 0; +} + +module_init(test_printf_init); + +MODULE_AUTHOR("Rasmus Villemoes <linux@rasmusvillemoes.dk>"); +MODULE_LICENSE("GPL"); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 95cd63b43b99..f9cee8e1233c 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1449,6 +1449,8 @@ int kptr_restrict __read_mostly; * (legacy clock framework) of the clock * - 'Cr' For a clock, it prints the current rate of the clock * + * ** Please update also Documentation/printk-formats.txt when making changes ** + * * Note: The difference between 'S' and 'F' is that on ia64 and ppc64 * function pointers are really function descriptors, which contain a * pointer to the real address. 
@@ -1457,7 +1459,7 @@ static noinline_for_stack char *pointer(const char *fmt, char *buf, char *end, void *ptr, struct printf_spec spec) { - int default_width = 2 * sizeof(void *) + (spec.flags & SPECIAL ? 2 : 0); + const int default_width = 2 * sizeof(void *); if (!ptr && *fmt != 'K') { /* @@ -1769,14 +1771,14 @@ qualifier: case 'n': /* - * Since %n poses a greater security risk than utility, treat - * it as an invalid format specifier. Warn about its use so - * that new instances don't get added. + * Since %n poses a greater security risk than + * utility, treat it as any other invalid or + * unsupported format specifier. */ - WARN_ONCE(1, "Please remove ignored %%n in '%s'\n", fmt); /* Fall-through */ default: + WARN_ONCE(1, "Please remove unsupported %%%c in format string\n", *fmt); spec->type = FORMAT_TYPE_INVALID; return fmt - start; } @@ -1811,41 +1813,16 @@ qualifier: * @fmt: The format string to use * @args: Arguments for the format string * - * This function follows C99 vsnprintf, but has some extensions: - * %pS output the name of a text symbol with offset - * %ps output the name of a text symbol without offset - * %pF output the name of a function pointer with its offset - * %pf output the name of a function pointer without its offset - * %pB output the name of a backtrace symbol with its offset - * %pR output the address range in a struct resource with decoded flags - * %pr output the address range in a struct resource with raw flags - * %pb output the bitmap with field width as the number of bits - * %pbl output the bitmap as range list with field width as the number of bits - * %pM output a 6-byte MAC address with colons - * %pMR output a 6-byte MAC address with colons in reversed order - * %pMF output a 6-byte MAC address with dashes - * %pm output a 6-byte MAC address without colons - * %pmR output a 6-byte MAC address without colons in reversed order - * %pI4 print an IPv4 address without leading zeros - * %pi4 print an IPv4 address with leading 
zeros - * %pI6 print an IPv6 address with colons - * %pi6 print an IPv6 address without colons - * %pI6c print an IPv6 address as specified by RFC 5952 - * %pIS depending on sa_family of 'struct sockaddr *' print IPv4/IPv6 address - * %piS depending on sa_family of 'struct sockaddr *' print IPv4/IPv6 address - * %pU[bBlL] print a UUID/GUID in big or little endian using lower or upper - * case. - * %*pE[achnops] print an escaped buffer - * %*ph[CDN] a variable-length hex string with a separator (supports up to 64 - * bytes of the input) - * %pC output the name (Common Clock Framework) or address (legacy clock - * framework) of a clock - * %pCn output the name (Common Clock Framework) or address (legacy clock - * framework) of a clock - * %pCr output the current rate of a clock - * %n is ignored + * This function generally follows C99 vsnprintf, but has some + * extensions and a few limitations: + * + * %n is unsupported + * %p* is handled by pointer() * - * ** Please update Documentation/printk-formats.txt when making changes ** + * See pointer() or Documentation/printk-formats.txt for more + * extensive description. + * + * ** Please update the documentation in both places when making changes ** * * The return value is the number of characters which would * be generated for the given input, excluding the trailing @@ -1944,10 +1921,15 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) break; case FORMAT_TYPE_INVALID: - if (str < end) - *str = '%'; - ++str; - break; + /* + * Presumably the arguments passed gcc's type + * checking, but there is no safe or sane way + * for us to continue parsing the format and + * fetching from the va_list; the remaining + * specifiers and arguments would be out of + * sync. 
+ */ + goto out; default: switch (spec.type) { @@ -1992,6 +1974,7 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) } } +out: if (size > 0) { if (str < end) *str = '\0'; @@ -2189,9 +2172,10 @@ do { \ switch (spec.type) { case FORMAT_TYPE_NONE: - case FORMAT_TYPE_INVALID: case FORMAT_TYPE_PERCENT_CHAR: break; + case FORMAT_TYPE_INVALID: + goto out; case FORMAT_TYPE_WIDTH: case FORMAT_TYPE_PRECISION: @@ -2253,6 +2237,7 @@ do { \ } } +out: return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf; #undef save_arg } @@ -2286,7 +2271,7 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) char *str, *end; const char *args = (const char *)bin_buf; - if (WARN_ON_ONCE((int) size < 0)) + if (WARN_ON_ONCE(size > INT_MAX)) return 0; str = buf; @@ -2375,12 +2360,14 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) break; case FORMAT_TYPE_PERCENT_CHAR: - case FORMAT_TYPE_INVALID: if (str < end) *str = '%'; ++str; break; + case FORMAT_TYPE_INVALID: + goto out; + default: { unsigned long long num; @@ -2423,6 +2410,7 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) } /* switch(spec.type) */ } /* while(*fmt) */ +out: if (size > 0) { if (str < end) *str = '\0'; diff --git a/mm/Kconfig b/mm/Kconfig index 0d9fdcd01e47..97a4e06b15c0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -200,18 +200,6 @@ config MEMORY_HOTREMOVE depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION -# -# If we have space for more page flags then we can enable additional -# optimizations and functionality. -# -# Regular Sparsemem takes page flag bits for the sectionid if it does not -# use a virtual memmap. Disable extended page flags for 32 bit platforms -# that require the use of a sectionid in the page flags. 
-# -config PAGEFLAGS_EXTENDED - def_bool y - depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM - # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 619984fc07ec..8ed2ffd963c5 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -637,7 +637,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, { struct bdi_writeback *wb; - might_sleep_if(gfp & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp)); if (!memcg_css->parent) return &bdi->wb; diff --git a/mm/debug.c b/mm/debug.c index e784110fb51d..668aa35191ca 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -25,12 +25,7 @@ static const struct trace_print_flags pageflag_names[] = { {1UL << PG_private, "private" }, {1UL << PG_private_2, "private_2" }, {1UL << PG_writeback, "writeback" }, -#ifdef CONFIG_PAGEFLAGS_EXTENDED {1UL << PG_head, "head" }, - {1UL << PG_tail, "tail" }, -#else - {1UL << PG_compound, "compound" }, -#endif {1UL << PG_swapcache, "swapcache" }, {1UL << PG_mappedtodisk, "mappedtodisk" }, {1UL << PG_reclaim, "reclaim" }, diff --git a/mm/dmapool.c b/mm/dmapool.c index 312a716fa14c..57312b5d6e12 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -326,7 +326,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, size_t offset; void *retval; - might_sleep_if(mem_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(mem_flags)); spin_lock_irqsave(&pool->lock, flags); list_for_each_entry(page, &pool->page_list, page_list) { diff --git a/mm/failslab.c b/mm/failslab.c index 98fb490311eb..79171b4a5826 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -3,11 +3,11 @@ static struct { struct fault_attr attr; - bool ignore_gfp_wait; + bool ignore_gfp_reclaim; bool cache_filter; } failslab = { .attr = FAULT_ATTR_INITIALIZER, - .ignore_gfp_wait = true, + 
.ignore_gfp_reclaim = true, .cache_filter = false, }; @@ -16,7 +16,7 @@ bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) if (gfpflags & __GFP_NOFAIL) return false; - if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) + if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM)) return false; if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) @@ -42,7 +42,7 @@ static int __init failslab_debugfs_init(void) return PTR_ERR(dir); if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &failslab.ignore_gfp_wait)) + &failslab.ignore_gfp_reclaim)) goto fail; if (!debugfs_create_bool("cache-filter", mode, dir, &failslab.cache_filter)) diff --git a/mm/filemap.c b/mm/filemap.c index 58e04e26f996..1bb007624b53 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1722,7 +1722,7 @@ no_cached_page: goto out; } error = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL & mapping_gfp_mask(mapping)); + mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error) { page_cache_release(page); if (error == -EEXIST) { @@ -1824,7 +1824,7 @@ static int page_cache_read(struct file *file, pgoff_t offset) return -ENOMEM; ret = add_to_page_cache_lru(page, mapping, offset, - GFP_KERNEL & mapping_gfp_mask(mapping)); + mapping_gfp_constraint(mapping, GFP_KERNEL)); if (ret == 0) ret = mapping->a_ops->readpage(file, page); else if (ret == -EEXIST) @@ -2713,7 +2713,7 @@ EXPORT_SYMBOL(generic_file_write_iter); * page is known to the local caching routines. * * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). + * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). 
* */ int try_to_release_page(struct page *page, gfp_t gfp_mask) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 00cfd1ae2271..c29ddebc8705 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -116,7 +116,7 @@ static void set_recommended_min_free_kbytes(void) for_each_populated_zone(zone) nr_zones++; - /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ + /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ recommended_min = pageblock_nr_pages * nr_zones * 2; /* @@ -786,7 +786,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) { - return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; + return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp; } /* Caller must hold page table lock. */ @@ -1755,8 +1755,7 @@ static void __split_huge_page_refcount(struct page *page, (1L << PG_unevictable))); page_tail->flags |= (1L << PG_dirty); - /* clear PageTail before overwriting first_page */ - smp_wmb(); + clear_compound_head(page_tail); if (page_is_young(page)) set_page_young(page_tail); @@ -2413,8 +2412,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) static struct page * khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - int node) + unsigned long address, int node) { VM_BUG_ON_PAGE(*hpage, *hpage); @@ -2481,8 +2479,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) static struct page * khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - int node) + unsigned long address, int node) { up_read(&mm->mmap_sem); VM_BUG_ON(!*hpage); @@ -2530,7 +2527,7 @@ static void collapse_huge_page(struct mm_struct *mm, __GFP_THISNODE; /* release the mmap_sem read lock. 
*/ - new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node); + new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); if (!new_page) return; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 74ef0c6a25dd..7ce07d681265 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -994,23 +994,22 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) #if defined(CONFIG_CMA) && defined(CONFIG_X86_64) static void destroy_compound_gigantic_page(struct page *page, - unsigned long order) + unsigned int order) { int i; int nr_pages = 1 << order; struct page *p = page + 1; for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __ClearPageTail(p); + clear_compound_head(p); set_page_refcounted(p); - p->first_page = NULL; } set_compound_order(page, 0); __ClearPageHead(page); } -static void free_gigantic_page(struct page *page, unsigned order) +static void free_gigantic_page(struct page *page, unsigned int order) { free_contig_range(page_to_pfn(page), 1 << order); } @@ -1054,7 +1053,7 @@ static bool zone_spans_last_pfn(const struct zone *zone, return zone_spans_pfn(zone, last_pfn); } -static struct page *alloc_gigantic_page(int nid, unsigned order) +static struct page *alloc_gigantic_page(int nid, unsigned int order) { unsigned long nr_pages = 1 << order; unsigned long ret, pfn, flags; @@ -1090,7 +1089,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned order) } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); -static void prep_compound_gigantic_page(struct page *page, unsigned long order); +static void prep_compound_gigantic_page(struct page *page, unsigned int order); static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) { @@ -1123,9 +1122,9 @@ static int alloc_fresh_gigantic_page(struct hstate *h, static inline bool gigantic_page_supported(void) { return true; } #else static inline bool gigantic_page_supported(void) { return false; } -static inline void 
free_gigantic_page(struct page *page, unsigned order) { } +static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, - unsigned long order) { } + unsigned int order) { } static inline int alloc_fresh_gigantic_page(struct hstate *h, nodemask_t *nodes_allowed) { return 0; } #endif @@ -1146,7 +1145,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) 1 << PG_writeback); } VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); - set_compound_page_dtor(page, NULL); + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); set_page_refcounted(page); if (hstate_is_gigantic(h)) { destroy_compound_gigantic_page(page, huge_page_order(h)); @@ -1242,7 +1241,7 @@ void free_huge_page(struct page *page) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { INIT_LIST_HEAD(&page->lru); - set_compound_page_dtor(page, free_huge_page); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); spin_lock(&hugetlb_lock); set_hugetlb_cgroup(page, NULL); h->nr_huge_pages++; @@ -1251,7 +1250,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } -static void prep_compound_gigantic_page(struct page *page, unsigned long order) +static void prep_compound_gigantic_page(struct page *page, unsigned int order) { int i; int nr_pages = 1 << order; @@ -1276,10 +1275,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) */ __ClearPageReserved(p); set_page_count(p, 0); - p->first_page = page; - /* Make sure p->first_page is always valid for PageTail() */ - smp_wmb(); - __SetPageTail(p); + set_compound_head(p, page); } } @@ -1294,7 +1290,7 @@ int PageHuge(struct page *page) return 0; page = compound_head(page); - return get_compound_page_dtor(page) == free_huge_page; + return page[1].compound_dtor == HUGETLB_PAGE_DTOR; } EXPORT_SYMBOL_GPL(PageHuge); @@ -1568,7 
+1564,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, if (page) { INIT_LIST_HEAD(&page->lru); r_nid = page_to_nid(page); - set_compound_page_dtor(page, free_huge_page); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); set_hugetlb_cgroup(page, NULL); /* * We incremented the global counters already @@ -1972,7 +1968,8 @@ found: return 1; } -static void __init prep_compound_huge_page(struct page *page, int order) +static void __init prep_compound_huge_page(struct page *page, + unsigned int order) { if (unlikely(order > (MAX_ORDER - 1))) prep_compound_gigantic_page(page, order); @@ -2683,7 +2680,7 @@ static int __init hugetlb_init(void) module_init(hugetlb_init); /* Should be called on processing a hugepagesz=... option */ -void __init hugetlb_add_hstate(unsigned order) +void __init hugetlb_add_hstate(unsigned int order) { struct hstate *h; unsigned long i; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 33d59abe91f1..d8fb10de0f14 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -385,7 +385,7 @@ void __init hugetlb_cgroup_file_init(void) /* * Add cgroup control files only if the huge page consists * of more than two normal pages. This is because we use - * page[2].lru.next for storing cgroup details. + * page[2].private for storing cgroup details. */ if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) __hugetlb_cgroup_file_init(hstate_index(h)); diff --git a/mm/internal.h b/mm/internal.h index d4b807d6c963..38e24b89e4c4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -14,6 +14,25 @@ #include <linux/fs.h> #include <linux/mm.h> +/* + * The set of flags that only affect watermark checking and reclaim + * behaviour. This is used by the MM to obey the caller constraints + * about IO, FS and watermark checking while ignoring placement + * hints such as HIGHMEM usage. 
+ */ +#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ + __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ + __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC) + +/* The GFP flags allowed during early boot */ +#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) + +/* Control allocation cpuset and node placement constraints */ +#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) + +/* Do not use these with a slab allocator */ +#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) + void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); @@ -61,9 +80,9 @@ static inline void __get_page_tail_foll(struct page *page, * speculative page access (like in * page_cache_get_speculative()) on tail pages. */ - VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page); + VM_BUG_ON_PAGE(atomic_read(&compound_head(page)->_count) <= 0, page); if (get_page_head) - atomic_inc(&page->first_page->_count); + atomic_inc(&compound_head(page)->_count); get_huge_page_tail(page); } @@ -129,6 +148,7 @@ struct alloc_context { int classzone_idx; int migratetype; enum zone_type high_zoneidx; + bool spread_dirty_pages; }; /* @@ -157,7 +177,7 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order); -extern void prep_compound_page(struct page *page, unsigned long order); +extern void prep_compound_page(struct page *page, unsigned int order); #ifdef CONFIG_MEMORY_FAILURE extern bool is_free_buddy_page(struct page *page); #endif @@ -215,7 +235,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, * page cannot be allocated or merged in parallel. Alternatively, it must * handle invalid values gracefully, and use page_order_unsafe() below. 
*/ -static inline unsigned long page_order(struct page *page) +static inline unsigned int page_order(struct page *page) { /* PageBuddy() must be checked by the caller */ return page_private(page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bc502e590366..9acfb165eb52 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2046,7 +2046,7 @@ retry: if (unlikely(task_in_memcg_oom(current))) goto nomem; - if (!(gfp_mask & __GFP_WAIT)) + if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); @@ -2120,7 +2120,7 @@ done_restock: /* * If the hierarchy is above the normal consumption range, schedule * reclaim on returning to userland. We can perform reclaim here - * if __GFP_WAIT but let's always punt for simplicity and so that + * if __GFP_RECLAIM but let's always punt for simplicity and so that * GFP_KERNEL can consistently be used during reclaim. @memcg is * not recorded as it most likely matches current's and won't * change in the meantime. 
As high limit is checked again before @@ -2801,7 +2801,7 @@ static unsigned long tree_stat(struct mem_cgroup *memcg, return val; } -static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) +static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { unsigned long val; @@ -4364,8 +4364,8 @@ static int mem_cgroup_do_precharge(unsigned long count) { int ret; - /* Try a single bulk charge without reclaim first */ - ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + /* Try a single bulk charge without reclaim first, kswapd may wake */ + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); if (!ret) { mc.precharge += count; return ret; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 16a0ec385320..8424b64711ac 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -776,8 +776,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) #define lru (1UL << PG_lru) #define swapbacked (1UL << PG_swapbacked) #define head (1UL << PG_head) -#define tail (1UL << PG_tail) -#define compound (1UL << PG_compound) #define slab (1UL << PG_slab) #define reserved (1UL << PG_reserved) @@ -800,12 +798,7 @@ static struct page_state { */ { slab, slab, MF_MSG_SLAB, me_kernel }, -#ifdef CONFIG_PAGEFLAGS_EXTENDED { head, head, MF_MSG_HUGE, me_huge_page }, - { tail, tail, MF_MSG_HUGE, me_huge_page }, -#else - { compound, compound, MF_MSG_HUGE, me_huge_page }, -#endif { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, diff --git a/mm/mempool.c b/mm/mempool.c index 4c533bc51d73..004d42b1dfaf 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -320,13 +320,13 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ 
gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ - gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO); + gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); repeat_alloc: @@ -349,7 +349,7 @@ repeat_alloc: } /* - * We use gfp mask w/o __GFP_WAIT or IO for the first round. If + * We use gfp mask w/o direct reclaim or IO for the first round. If * alloc failed with that and @pool was empty, retry immediately. */ if (gfp_temp != gfp_mask) { @@ -358,8 +358,8 @@ repeat_alloc: goto repeat_alloc; } - /* We must not sleep if !__GFP_WAIT */ - if (!(gfp_mask & __GFP_WAIT)) { + /* We must not sleep if !__GFP_DIRECT_RECLAIM */ + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { spin_unlock_irqrestore(&pool->lock, flags); return NULL; } diff --git a/mm/migrate.c b/mm/migrate.c index 2834faba719a..7890d0bb5e23 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1578,7 +1578,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, (GFP_HIGHUSER_MOVABLE | __GFP_THISNODE | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & - ~GFP_IOFS, 0); + ~(__GFP_IO | __GFP_FS), 0); return newpage; } @@ -1752,7 +1752,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, goto out_dropref; new_page = alloc_pages_node(node, - (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT, + (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, HPAGE_PMD_ORDER); if (!new_page) goto out_fail; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e4778285d8d1..d13a33918fa2 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -118,6 +118,15 @@ found: return t; } +/* + * order == -1 means the oom kill is required by sysrq, otherwise only + * for display purposes. + */ +static inline bool is_sysrq_oom(struct oom_control *oc) +{ + return oc->order == -1; +} + /* return true if the task is not adequate as candidate victim task. 
*/ static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask) @@ -265,7 +274,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, * Don't allow any other task to have access to the reserves. */ if (test_tsk_thread_flag(task, TIF_MEMDIE)) { - if (oc->order != -1) + if (!is_sysrq_oom(oc)) return OOM_SCAN_ABORT; } if (!task->mm) @@ -278,7 +287,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, if (oom_task_origin(task)) return OOM_SCAN_SELECT; - if (task_will_free_mem(task) && oc->order != -1) + if (task_will_free_mem(task) && !is_sysrq_oom(oc)) return OOM_SCAN_ABORT; return OOM_SCAN_OK; @@ -629,7 +638,7 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint, return; } /* Do not panic for oom kills triggered by sysrq */ - if (oc->order == -1) + if (is_sysrq_oom(oc)) return; dump_header(oc, NULL, memcg); panic("Out of memory: %s panic_on_oom is enabled\n", @@ -709,7 +718,7 @@ bool out_of_memory(struct oom_control *oc) p = select_bad_process(oc, &points, totalpages); /* Found nothing?!?! Either we hang forever, or we panic. 
*/ - if (!p && oc->order != -1) { + if (!p && !is_sysrq_oom(oc)) { dump_header(oc, NULL, NULL); panic("Out of memory and no killable processes...\n"); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 446bb36ee59d..208e4c7e771b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -169,19 +169,19 @@ void pm_restrict_gfp_mask(void) WARN_ON(!mutex_is_locked(&pm_mutex)); WARN_ON(saved_gfp_mask); saved_gfp_mask = gfp_allowed_mask; - gfp_allowed_mask &= ~GFP_IOFS; + gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); } bool pm_suspended_storage(void) { - if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) + if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) return false; return true; } #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE -int pageblock_order __read_mostly; +unsigned int pageblock_order __read_mostly; #endif static void __free_pages_ok(struct page *page, unsigned int order); @@ -229,6 +229,15 @@ static char * const zone_names[MAX_NR_ZONES] = { #endif }; +static void free_compound_page(struct page *page); +compound_page_dtor * const compound_page_dtors[] = { + NULL, + free_compound_page, +#ifdef CONFIG_HUGETLB_PAGE + free_huge_page, +#endif +}; + int min_free_kbytes = 1024; int user_min_free_kbytes = -1; @@ -436,15 +445,15 @@ out: /* * Higher-order pages are called "compound pages". They are structured thusly: * - * The first PAGE_SIZE page is called the "head page". + * The first PAGE_SIZE page is called the "head page" and have PG_head set. * - * The remaining PAGE_SIZE pages are called "tail pages". + * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded + * in bit 0 of page->compound_head. The rest of bits is pointer to head page. * - * All pages have PG_compound set. All tail pages have their ->first_page - * pointing at the head page. + * The first tail page's ->compound_dtor holds the offset in array of compound + * page destructors. See compound_page_dtors. 
* - * The first tail page's ->lru.next holds the address of the compound page's - * put_page() function. Its ->lru.prev holds the order of allocation. + * The first tail page's ->compound_order holds the order of allocation. * This usage means that zero-order pages may not be compound. */ @@ -453,21 +462,18 @@ static void free_compound_page(struct page *page) __free_pages_ok(page, compound_order(page)); } -void prep_compound_page(struct page *page, unsigned long order) +void prep_compound_page(struct page *page, unsigned int order) { int i; int nr_pages = 1 << order; - set_compound_page_dtor(page, free_compound_page); + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; set_page_count(p, 0); - p->first_page = page; - /* Make sure p->first_page is always valid for PageTail() */ - smp_wmb(); - __SetPageTail(p); + set_compound_head(p, page); } } @@ -656,7 +662,7 @@ static inline void __free_one_page(struct page *page, unsigned long combined_idx; unsigned long uninitialized_var(buddy_idx); struct page *buddy; - int max_order = MAX_ORDER; + unsigned int max_order = MAX_ORDER; VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); @@ -669,7 +675,7 @@ static inline void __free_one_page(struct page *page, * pageblock. Without this, pageblock isolation * could cause incorrect freepage accounting. 
*/ - max_order = min(MAX_ORDER, pageblock_order + 1); + max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); } else { __mod_zone_freepage_state(zone, 1 << order, migratetype); } @@ -817,7 +823,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, if (unlikely(has_isolate_pageblock(zone))) mt = get_pageblock_migratetype(page); - /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); } while (--to_free && --batch_free && !list_empty(list)); @@ -846,17 +851,30 @@ static void free_one_page(struct zone *zone, static int free_tail_pages_check(struct page *head_page, struct page *page) { - if (!IS_ENABLED(CONFIG_DEBUG_VM)) - return 0; + int ret = 1; + + /* + * We rely page->lru.next never has bit 0 set, unless the page + * is PageTail(). Let's make sure that's true even for poisoned ->lru. + */ + BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); + + if (!IS_ENABLED(CONFIG_DEBUG_VM)) { + ret = 0; + goto out; + } if (unlikely(!PageTail(page))) { bad_page(page, "PageTail not set", 0); - return 1; + goto out; } - if (unlikely(page->first_page != head_page)) { - bad_page(page, "first_page not consistent", 0); - return 1; + if (unlikely(compound_head(page) != head_page)) { + bad_page(page, "compound_head not consistent", 0); + goto out; } - return 0; + ret = 0; +out: + clear_compound_head(page); + return ret; } static void __meminit __init_single_page(struct page *page, unsigned long pfn, @@ -923,6 +941,10 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) struct page *page = pfn_to_page(start_pfn); init_reserved_page(start_pfn); + + /* Avoid false-positive PageTail() */ + INIT_LIST_HEAD(&page->lru); + SetPageReserved(page); } } @@ -1417,15 +1439,14 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * the free lists for the desirable migrate type are depleted */ static int fallbacks[MIGRATE_TYPES][4] = { - 
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, #ifdef CONFIG_CMA - [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ + [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ #endif - [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ #ifdef CONFIG_MEMORY_ISOLATION - [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ + [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ #endif }; @@ -1450,7 +1471,7 @@ int move_freepages(struct zone *zone, int migratetype) { struct page *page; - unsigned long order; + unsigned int order; int pages_moved = 0; #ifndef CONFIG_HOLES_IN_ZONE @@ -1563,7 +1584,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt) static void steal_suitable_fallback(struct zone *zone, struct page *page, int start_type) { - int current_order = page_order(page); + unsigned int current_order = page_order(page); int pages; /* Take ownership for orders >= pageblock_order */ @@ -1598,7 +1619,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, *can_steal = false; for (i = 0;; i++) { fallback_mt = fallbacks[migratetype][i]; - if (fallback_mt == MIGRATE_RESERVE) + if (fallback_mt == MIGRATE_TYPES) break; if (list_empty(&area->free_list[fallback_mt])) @@ -1617,6 +1638,101 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, return -1; } +/* + * Reserve a pageblock for exclusive use of high-order atomic allocations if + * there are no empty page blocks that contain a page with a suitable order + */ +static void reserve_highatomic_pageblock(struct page 
*page, struct zone *zone, + unsigned int alloc_order) +{ + int mt; + unsigned long max_managed, flags; + + /* + * Limit the number reserved to 1 pageblock or roughly 1% of a zone. + * Check is race-prone but harmless. + */ + max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; + if (zone->nr_reserved_highatomic >= max_managed) + return; + + spin_lock_irqsave(&zone->lock, flags); + + /* Recheck the nr_reserved_highatomic limit under the lock */ + if (zone->nr_reserved_highatomic >= max_managed) + goto out_unlock; + + /* Yoink! */ + mt = get_pageblock_migratetype(page); + if (mt != MIGRATE_HIGHATOMIC && + !is_migrate_isolate(mt) && !is_migrate_cma(mt)) { + zone->nr_reserved_highatomic += pageblock_nr_pages; + set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); + move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); + } + +out_unlock: + spin_unlock_irqrestore(&zone->lock, flags); +} + +/* + * Used when an allocation is about to fail under memory pressure. This + * potentially hurts the reliability of high-order allocations when under + * intense memory pressure but failed atomic allocations should be easier + * to recover from than an OOM. 
+ */ +static void unreserve_highatomic_pageblock(const struct alloc_context *ac) +{ + struct zonelist *zonelist = ac->zonelist; + unsigned long flags; + struct zoneref *z; + struct zone *zone; + struct page *page; + int order; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + ac->nodemask) { + /* Preserve at least one pageblock */ + if (zone->nr_reserved_highatomic <= pageblock_nr_pages) + continue; + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + struct free_area *area = &(zone->free_area[order]); + + if (list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) + continue; + + page = list_entry(area->free_list[MIGRATE_HIGHATOMIC].next, + struct page, lru); + + /* + * It should never happen but changes to locking could + * inadvertently allow a per-cpu drain to add pages + * to MIGRATE_HIGHATOMIC while unreserving so be safe + * and watch for underflows. + */ + zone->nr_reserved_highatomic -= min(pageblock_nr_pages, + zone->nr_reserved_highatomic); + + /* + * Convert to ac->migratetype and avoid the normal + * pageblock stealing heuristics. Minimally, the caller + * is doing the work and needs the pages. More + * importantly, if the block was always converted to + * MIGRATE_UNMOVABLE or another type then the number + * of pageblocks that cannot be completely freed + * may increase. + */ + set_pageblock_migratetype(page, ac->migratetype); + move_freepages_block(zone, page, ac->migratetype); + spin_unlock_irqrestore(&zone->lock, flags); + return; + } + spin_unlock_irqrestore(&zone->lock, flags); + } +} + /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) @@ -1672,29 +1788,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) * Call me with the zone->lock already held. 
*/ static struct page *__rmqueue(struct zone *zone, unsigned int order, - int migratetype) + int migratetype, gfp_t gfp_flags) { struct page *page; -retry_reserve: page = __rmqueue_smallest(zone, order, migratetype); - - if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { + if (unlikely(!page)) { if (migratetype == MIGRATE_MOVABLE) page = __rmqueue_cma_fallback(zone, order); if (!page) page = __rmqueue_fallback(zone, order, migratetype); - - /* - * Use MIGRATE_RESERVE rather than fail an allocation. goto - * is used because __rmqueue_smallest is an inline function - * and we want just one call site - */ - if (!page) { - migratetype = MIGRATE_RESERVE; - goto retry_reserve; - } } trace_mm_page_alloc_zone_locked(page, order, migratetype); @@ -1714,7 +1818,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - struct page *page = __rmqueue(zone, order, migratetype); + struct page *page = __rmqueue(zone, order, migratetype, 0); if (unlikely(page == NULL)) break; @@ -2086,7 +2190,7 @@ int split_free_page(struct page *page) static inline struct page *buffered_rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int migratetype) + gfp_t gfp_flags, int alloc_flags, int migratetype) { unsigned long flags; struct page *page; @@ -2129,7 +2233,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, WARN_ON_ONCE(order > 1); } spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order, migratetype); + + page = NULL; + if (alloc_flags & ALLOC_HARDER) { + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); + } + if (!page) + page = __rmqueue(zone, order, migratetype, gfp_flags); spin_unlock(&zone->lock); if (!page) goto failed; @@ -2160,11 +2272,11 @@ static struct { struct fault_attr attr; bool ignore_gfp_highmem; - bool ignore_gfp_wait; + bool ignore_gfp_reclaim; u32 
min_order; } fail_page_alloc = { .attr = FAULT_ATTR_INITIALIZER, - .ignore_gfp_wait = true, + .ignore_gfp_reclaim = true, .ignore_gfp_highmem = true, .min_order = 1, }; @@ -2183,7 +2295,8 @@ static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) return false; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) return false; - if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) + if (fail_page_alloc.ignore_gfp_reclaim && + (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; return should_fail(&fail_page_alloc.attr, 1 << order); @@ -2202,7 +2315,7 @@ static int __init fail_page_alloc_debugfs(void) return PTR_ERR(dir); if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &fail_page_alloc.ignore_gfp_wait)) + &fail_page_alloc.ignore_gfp_reclaim)) goto fail; if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, &fail_page_alloc.ignore_gfp_highmem)) @@ -2232,42 +2345,77 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) #endif /* CONFIG_FAIL_PAGE_ALLOC */ /* - * Return true if free pages are above 'mark'. This takes into account the order - * of the allocation. + * Return true if free base pages are above 'mark'. For high-order checks it + * will return true if the order-0 watermark is reached and there is at least + * one free page of a suitable size. Checking now avoids taking the zone lock + * to check in the allocation paths if no pages are free. 
*/ static bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, int alloc_flags, long free_pages) { - /* free_pages may go negative - that's OK */ long min = mark; int o; - long free_cma = 0; + const int alloc_harder = (alloc_flags & ALLOC_HARDER); + /* free_pages may go negative - that's OK */ free_pages -= (1 << order) - 1; + if (alloc_flags & ALLOC_HIGH) min -= min / 2; - if (alloc_flags & ALLOC_HARDER) + + /* + * If the caller does not have rights to ALLOC_HARDER then subtract + * the high-atomic reserves. This will over-estimate the size of the + * atomic reserve but it avoids a search. + */ + if (likely(!alloc_harder)) + free_pages -= z->nr_reserved_highatomic; + else min -= min / 4; + #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ if (!(alloc_flags & ALLOC_CMA)) - free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); + free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) + /* + * Check watermarks for an order-0 allocation request. If these + * are not met, then a high-order request also cannot go ahead + * even if a suitable page happened to be free. 
+ */ + if (free_pages <= min + z->lowmem_reserve[classzone_idx]) return false; - for (o = 0; o < order; o++) { - /* At the next order, this order's pages become unavailable */ - free_pages -= z->free_area[o].nr_free << o; - /* Require fewer higher order pages to be free */ - min >>= 1; + /* If this is an order-0 request then the watermark is fine */ + if (!order) + return true; + + /* For a high-order request, check at least one suitable page is free */ + for (o = order; o < MAX_ORDER; o++) { + struct free_area *area = &z->free_area[o]; + int mt; + + if (!area->nr_free) + continue; + + if (alloc_harder) + return true; + + for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { + if (!list_empty(&area->free_list[mt])) + return true; + } - if (free_pages <= min) - return false; +#ifdef CONFIG_CMA + if ((alloc_flags & ALLOC_CMA) && + !list_empty(&area->free_list[MIGRATE_CMA])) { + return true; + } +#endif } - return true; + return false; } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, @@ -2278,134 +2426,18 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags) + unsigned long mark, int classzone_idx) { long free_pages = zone_page_state(z, NR_FREE_PAGES); if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, classzone_idx, 0, free_pages); } #ifdef CONFIG_NUMA -/* - * zlc_setup - Setup for "zonelist cache". Uses cached zone data to - * skip over zones that are not allowed by the cpuset, or that have - * been recently (in last second) found to be nearly full. See further - * comments in mmzone.h. Reduces cache footprint of zonelist scans - * that have to skip over a lot of full or unallowed zones. 
- * - * If the zonelist cache is present in the passed zonelist, then - * returns a pointer to the allowed node mask (either the current - * tasks mems_allowed, or node_states[N_MEMORY].) - * - * If the zonelist cache is not available for this zonelist, does - * nothing and returns NULL. - * - * If the fullzones BITMAP in the zonelist cache is stale (more than - * a second since last zap'd) then we zap it out (clear its bits.) - * - * We hold off even calling zlc_setup, until after we've checked the - * first zone in the zonelist, on the theory that most allocations will - * be satisfied from that first zone, so best to examine that zone as - * quickly as we can. - */ -static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - nodemask_t *allowednodes; /* zonelist_cache approximation */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return NULL; - - if (time_after(jiffies, zlc->last_full_zap + HZ)) { - bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); - zlc->last_full_zap = jiffies; - } - - allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? - &cpuset_current_mems_allowed : - &node_states[N_MEMORY]; - return allowednodes; -} - -/* - * Given 'z' scanning a zonelist, run a couple of quick checks to see - * if it is worth looking at further for free memory: - * 1) Check that the zone isn't thought to be full (doesn't have its - * bit set in the zonelist_cache fullzones BITMAP). - * 2) Check that the zones node (obtained from the zonelist_cache - * z_to_n[] mapping) is allowed in the passed in allowednodes mask. - * Return true (non-zero) if zone is worth looking at further, or - * else return false (zero) if it is not. - * - * This check -ignores- the distinction between various watermarks, - * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... 
If a zone is - * found to be full for any variation of these watermarks, it will - * be considered full for up to one second by all requests, unless - * we are so low on memory on all allowed nodes that we are forced - * into the second scan of the zonelist. - * - * In the second scan we ignore this zonelist cache and exactly - * apply the watermarks to all zones, even it is slower to do so. - * We are low on memory in the second scan, and should leave no stone - * unturned looking for a free page. - */ -static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, - nodemask_t *allowednodes) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - int i; /* index of *z in zonelist zones */ - int n; /* node that zone *z is on */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return 1; - - i = z - zonelist->_zonerefs; - n = zlc->z_to_n[i]; - - /* This zone is worth trying if it is allowed but not full */ - return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); -} - -/* - * Given 'z' scanning a zonelist, set the corresponding bit in - * zlc->fullzones, so that subsequent attempts to allocate a page - * from that zone don't waste time re-examining it. 
- */ -static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - int i; /* index of *z in zonelist zones */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return; - - i = z - zonelist->_zonerefs; - - set_bit(i, zlc->fullzones); -} - -/* - * clear all zones full, called after direct reclaim makes progress so that - * a zone that was recently full is not skipped over for up to a second - */ -static void zlc_clear_zones_full(struct zonelist *zonelist) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return; - - bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); -} - static bool zone_local(struct zone *local_zone, struct zone *zone) { return local_zone->node == zone->node; @@ -2416,28 +2448,7 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < RECLAIM_DISTANCE; } - #else /* CONFIG_NUMA */ - -static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) -{ - return NULL; -} - -static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, - nodemask_t *allowednodes) -{ - return 1; -} - -static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) -{ -} - -static void zlc_clear_zones_full(struct zonelist *zonelist) -{ -} - static bool zone_local(struct zone *local_zone, struct zone *zone) { return true; @@ -2447,7 +2458,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return true; } - #endif /* CONFIG_NUMA */ static void reset_alloc_batches(struct zone *preferred_zone) @@ -2474,11 +2484,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, struct zoneref *z; struct page *page = NULL; struct zone *zone; - nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ - int zlc_active = 0; /* set if using zonelist_cache 
*/ - int did_zlc_setup = 0; /* just call zlc_setup() one time */ - bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && - (gfp_mask & __GFP_WRITE); int nr_fair_skipped = 0; bool zonelist_rescan; @@ -2493,9 +2498,6 @@ zonelist_scan: ac->nodemask) { unsigned long mark; - if (IS_ENABLED(CONFIG_NUMA) && zlc_active && - !zlc_zone_worth_trying(zonelist, z, allowednodes)) - continue; if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed(zone, gfp_mask)) @@ -2533,14 +2535,14 @@ zonelist_scan: * * XXX: For now, allow allocations to potentially * exceed the per-zone dirty limit in the slowpath - * (ALLOC_WMARK_LOW unset) before going into reclaim, + * (spread_dirty_pages unset) before going into reclaim, * which is important when on a NUMA setup the allowed * zones are together not big enough to reach the * global limit. The proper fix for these situations * will require awareness of zones in the * dirty-throttling and the flusher threads. */ - if (consider_zone_dirty && !zone_dirty_ok(zone)) + if (ac->spread_dirty_pages && !zone_dirty_ok(zone)) continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; @@ -2553,28 +2555,8 @@ zonelist_scan: if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; - if (IS_ENABLED(CONFIG_NUMA) && - !did_zlc_setup && nr_online_nodes > 1) { - /* - * we do zlc_setup if there are multiple nodes - * and before considering the first zone allowed - * by the cpuset. - */ - allowednodes = zlc_setup(zonelist, alloc_flags); - zlc_active = 1; - did_zlc_setup = 1; - } - if (zone_reclaim_mode == 0 || !zone_allows_reclaim(ac->preferred_zone, zone)) - goto this_zone_full; - - /* - * As we may have just activated ZLC, check if the first - * eligible zone has failed zone_reclaim recently. 
- */ - if (IS_ENABLED(CONFIG_NUMA) && zlc_active && - !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; ret = zone_reclaim(zone, gfp_mask, order); @@ -2591,34 +2573,26 @@ zonelist_scan: ac->classzone_idx, alloc_flags)) goto try_this_zone; - /* - * Failed to reclaim enough to meet watermark. - * Only mark the zone full if checking the min - * watermark or if we failed to reclaim just - * 1<<order pages or else the page allocator - * fastpath will prematurely mark zones full - * when the watermark is between the low and - * min watermarks. - */ - if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || - ret == ZONE_RECLAIM_SOME) - goto this_zone_full; - continue; } } try_this_zone: page = buffered_rmqueue(ac->preferred_zone, zone, order, - gfp_mask, ac->migratetype); + gfp_mask, alloc_flags, ac->migratetype); if (page) { if (prep_new_page(page, order, gfp_mask, alloc_flags)) goto try_this_zone; + + /* + * If this is a high-order atomic allocation then check + * if the pageblock should be reserved for the future + */ + if (unlikely(order && (alloc_flags & ALLOC_HARDER))) + reserve_highatomic_pageblock(page, zone, order); + return page; } -this_zone_full: - if (IS_ENABLED(CONFIG_NUMA) && zlc_active) - zlc_mark_zone_full(zonelist, z); } /* @@ -2639,12 +2613,6 @@ this_zone_full: zonelist_rescan = true; } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { - /* Disable zlc cache for second zonelist scan */ - zlc_active = 0; - zonelist_rescan = true; - } - if (zonelist_rescan) goto zonelist_scan; @@ -2669,7 +2637,7 @@ static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); -void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) +void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) { unsigned int filter = SHOW_MEM_FILTER_NODES; @@ -2686,7 +2654,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 
if (test_thread_flag(TIF_MEMDIE) || (current->flags & (PF_MEMALLOC | PF_EXITING))) filter &= ~SHOW_MEM_FILTER_NODES; - if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) + if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; if (fmt) { @@ -2703,7 +2671,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) va_end(args); } - pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", + pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n", current->comm, order, gfp_mask); dump_stack(); @@ -2889,19 +2857,17 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, if (unlikely(!(*did_some_progress))) return NULL; - /* After successful reclaim, reconsider all zones for allocation */ - if (IS_ENABLED(CONFIG_NUMA)) - zlc_clear_zones_full(ac->zonelist); - retry: page = get_page_from_freelist(gfp_mask, order, alloc_flags & ~ALLOC_NO_WATERMARKS, ac); /* * If an allocation failed after direct reclaim, it could be because - * pages are pinned on the per-cpu lists. Drain them and try again + * pages are pinned on the per-cpu lists or in high alloc reserves. + * Shrink them and try again */ if (!page && !drained) { + unreserve_highatomic_pageblock(ac); drain_all_pages(NULL); drained = true; goto retry; @@ -2946,7 +2912,6 @@ static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; - const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -2955,11 +2920,11 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). 
+ * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); - if (atomic) { + if (gfp_mask & __GFP_ATOMIC) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. @@ -2996,11 +2961,16 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); } +static inline bool is_thp_gfp_mask(gfp_t gfp_mask) +{ + return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { - const gfp_t wait = gfp_mask & __GFP_WAIT; + bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; struct page *page = NULL; int alloc_flags; unsigned long pages_reclaimed = 0; @@ -3021,15 +2991,23 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, } /* + * We also sanity check to catch abuse of atomic reserves being used by + * callers that are not in atomic context. + */ + if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == + (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) + gfp_mask &= ~__GFP_ATOMIC; + + /* * If this allocation cannot block and it is for a specific node, then * fail early. There's no need to wakeup kswapd or retry for a * speculative node-specific allocation. 
*/ - if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait) + if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim) goto nopage; retry: - if (!(gfp_mask & __GFP_NO_KSWAPD)) + if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); /* @@ -3072,8 +3050,8 @@ retry: } } - /* Atomic allocations - we can't balance anything */ - if (!wait) { + /* Caller is not willing to reclaim, we can't balance anything */ + if (!can_direct_reclaim) { /* * All existing users of the deprecated __GFP_NOFAIL are * blockable, so warn of any new users that actually allow this @@ -3103,7 +3081,7 @@ retry: goto got_pg; /* Checks for THP-specific high-order allocations */ - if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) { + if (is_thp_gfp_mask(gfp_mask)) { /* * If compaction is deferred for high-order allocations, it is * because sync compaction recently failed. If this is the case @@ -3138,8 +3116,7 @@ retry: * fault, so use asynchronous memory compaction for THP unless it is * khugepaged trying to collapse. */ - if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE || - (current->flags & PF_KTHREAD)) + if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD)) migration_mode = MIGRATE_SYNC_LIGHT; /* Try direct reclaim and then allocating */ @@ -3210,7 +3187,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, lockdep_trace_alloc(gfp_mask); - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); if (should_fail_alloc_page(gfp_mask, order)) return NULL; @@ -3231,6 +3208,10 @@ retry_cpuset: /* We set it here, as __alloc_pages_slowpath might have changed it */ ac.zonelist = zonelist; + + /* Dirty zone balancing only done in the fast path */ + ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); + /* The preferred zone is used for statistics later */ preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, ac.nodemask ? 
: &cpuset_current_mems_allowed, @@ -3249,6 +3230,7 @@ retry_cpuset: * complete. */ alloc_mask = memalloc_noio_flags(gfp_mask); + ac.spread_dirty_pages = false; page = __alloc_pages_slowpath(alloc_mask, order, &ac); } @@ -3467,7 +3449,8 @@ void free_kmem_pages(unsigned long addr, unsigned int order) } } -static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) +static void *make_alloc_exact(unsigned long addr, unsigned int order, + size_t size) { if (addr) { unsigned long alloc_end = addr + (PAGE_SIZE << order); @@ -3517,7 +3500,7 @@ EXPORT_SYMBOL(alloc_pages_exact); */ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { - unsigned order = get_order(size); + unsigned int order = get_order(size); struct page *p = alloc_pages_node(nid, gfp_mask, order); if (!p) return NULL; @@ -3666,7 +3649,6 @@ static void show_migration_types(unsigned char type) [MIGRATE_UNMOVABLE] = 'U', [MIGRATE_RECLAIMABLE] = 'E', [MIGRATE_MOVABLE] = 'M', - [MIGRATE_RESERVE] = 'R', #ifdef CONFIG_CMA [MIGRATE_CMA] = 'C', #endif @@ -3819,7 +3801,8 @@ void show_free_areas(unsigned int filter) } for_each_populated_zone(zone) { - unsigned long nr[MAX_ORDER], flags, order, total = 0; + unsigned int order; + unsigned long nr[MAX_ORDER], flags, total = 0; unsigned char types[MAX_ORDER]; if (skip_free_areas_node(filter, zone_to_nid(zone))) @@ -4168,7 +4151,7 @@ static void build_zonelists(pg_data_t *pgdat) nodemask_t used_mask; int local_node, prev_node; struct zonelist *zonelist; - int order = current_zonelist_order; + unsigned int order = current_zonelist_order; /* initialize zonelists */ for (i = 0; i < MAX_ZONELISTS; i++) { @@ -4212,20 +4195,6 @@ static void build_zonelists(pg_data_t *pgdat) build_thisnode_zonelists(pgdat); } -/* Construct the zonelist performance cache - see further mmzone.h */ -static void build_zonelist_cache(pg_data_t *pgdat) -{ - struct zonelist *zonelist; - struct zonelist_cache *zlc; - struct zoneref *z; - - zonelist = 
&pgdat->node_zonelists[0]; - zonelist->zlcache_ptr = zlc = &zonelist->zlcache; - bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); - for (z = zonelist->_zonerefs; z->zone; z++) - zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); -} - #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * Return node id of node used for "local" allocations. @@ -4286,12 +4255,6 @@ static void build_zonelists(pg_data_t *pgdat) zonelist->_zonerefs[j].zone_idx = 0; } -/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ -static void build_zonelist_cache(pg_data_t *pgdat) -{ - pgdat->node_zonelists[0].zlcache_ptr = NULL; -} - #endif /* CONFIG_NUMA */ /* @@ -4332,14 +4295,12 @@ static int __build_all_zonelists(void *data) if (self && !node_online(self->node_id)) { build_zonelists(self); - build_zonelist_cache(self); } for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); - build_zonelist_cache(pgdat); } /* @@ -4499,120 +4460,6 @@ static inline unsigned long wait_table_bits(unsigned long size) } /* - * Check if a pageblock contains reserved pages - */ -static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) - return 1; - } - return 0; -} - -/* - * Mark a number of pageblocks as MIGRATE_RESERVE. The number - * of blocks reserved is based on min_wmark_pages(zone). The memory within - * the reserve will tend to store contiguous free pages. 
Setting min_free_kbytes - * higher will lead to a bigger reserve which will get freed as contiguous - * blocks as reclaim kicks in - */ -static void setup_zone_migrate_reserve(struct zone *zone) -{ - unsigned long start_pfn, pfn, end_pfn, block_end_pfn; - struct page *page; - unsigned long block_migratetype; - int reserve; - int old_reserve; - - /* - * Get the start pfn, end pfn and the number of blocks to reserve - * We have to be careful to be aligned to pageblock_nr_pages to - * make sure that we always check pfn_valid for the first page in - * the block. - */ - start_pfn = zone->zone_start_pfn; - end_pfn = zone_end_pfn(zone); - start_pfn = roundup(start_pfn, pageblock_nr_pages); - reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> - pageblock_order; - - /* - * Reserve blocks are generally in place to help high-order atomic - * allocations that are short-lived. A min_free_kbytes value that - * would result in more than 2 reserve blocks for atomic allocations - * is assumed to be in place to help anti-fragmentation for the - * future allocation of hugepages at runtime. - */ - reserve = min(2, reserve); - old_reserve = zone->nr_migrate_reserve_block; - - /* When memory hot-add, we almost always need to do nothing */ - if (reserve == old_reserve) - return; - zone->nr_migrate_reserve_block = reserve; - - for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { - if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone))) - return; - - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - - /* Watch out for overlapping nodes */ - if (page_to_nid(page) != zone_to_nid(zone)) - continue; - - block_migratetype = get_pageblock_migratetype(page); - - /* Only test what is necessary when the reserves are not met */ - if (reserve > 0) { - /* - * Blocks with reserved pages will never free, skip - * them. 
- */ - block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); - if (pageblock_is_reserved(pfn, block_end_pfn)) - continue; - - /* If this block is reserved, account for it */ - if (block_migratetype == MIGRATE_RESERVE) { - reserve--; - continue; - } - - /* Suitable for reserving if this block is movable */ - if (block_migratetype == MIGRATE_MOVABLE) { - set_pageblock_migratetype(page, - MIGRATE_RESERVE); - move_freepages_block(zone, page, - MIGRATE_RESERVE); - reserve--; - continue; - } - } else if (!old_reserve) { - /* - * At boot time we don't need to scan the whole zone - * for turning off MIGRATE_RESERVE. - */ - break; - } - - /* - * If the reserve is met and this is a previous reserved block, - * take it back - */ - if (block_migratetype == MIGRATE_RESERVE) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - move_freepages_block(zone, page, MIGRATE_MOVABLE); - } - } -} - -/* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. @@ -4651,9 +4498,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * movable at startup. This will force kernel allocations * to reserve their blocks rather than leaking throughout * the address space during boot when many long-lived - * kernel allocations are made. Later some blocks near - * the start are marked MIGRATE_RESERVE by - * setup_zone_migrate_reserve() + * kernel allocations are made. * * bitmap is created for zone's valid pfn range. 
but memmap * can be created for invalid pages (for alignment) @@ -6214,7 +6059,6 @@ static void __setup_per_zone_wmarks(void) high_wmark_pages(zone) - low_wmark_pages(zone) - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -6836,7 +6680,8 @@ int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype) { unsigned long outer_start, outer_end; - int ret = 0, order; + unsigned int order; + int ret = 0; struct compact_control cc = { .nr_migratepages = 0, diff --git a/mm/readahead.c b/mm/readahead.c index 998ad592f408..ba22d7fe0afb 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -90,7 +90,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, page = list_to_page(pages); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, - GFP_KERNEL & mapping_gfp_mask(mapping))) { + mapping_gfp_constraint(mapping, GFP_KERNEL))) { read_cache_pages_invalidate_page(mapping, page); continue; } @@ -128,7 +128,7 @@ static int read_pages(struct address_space *mapping, struct file *filp, struct page *page = list_to_page(pages); list_del(&page->lru); if (!add_to_page_cache_lru(page, mapping, page->index, - GFP_KERNEL & mapping_gfp_mask(mapping))) { + mapping_gfp_constraint(mapping, GFP_KERNEL))) { mapping->a_ops->readpage(filp, page); } page_cache_release(page); diff --git a/mm/shmem.c b/mm/shmem.c index 3b8b73928398..9187eee4128b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -73,6 +73,8 @@ static struct vfsmount *shm_mnt; #include <asm/uaccess.h> #include <asm/pgtable.h> +#include "internal.h" + #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) diff --git a/mm/slab.c b/mm/slab.c index 272e809404d5..e0819fa96559 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1031,12 +1031,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } /* - * Construct gfp mask to 
allocate from a specific node but do not invoke reclaim - * or warn about failures. + * Construct gfp mask to allocate from a specific node but do not direct reclaim + * or warn about failures. kswapd may still wake to reclaim in the background. */ static inline gfp_t gfp_exact_node(gfp_t flags) { - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT; + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM; } #endif @@ -1889,21 +1889,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) freelist = page->freelist; slab_destroy_debugcheck(cachep, page); - if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { - struct rcu_head *head; - - /* - * RCU free overloads the RCU head over the LRU. - * slab_page has been overloeaded over the LRU, - * however it is not used from now on so that - * we can use it safely. - */ - head = (void *)&page->rcu_head; - call_rcu(head, kmem_rcu_free); - - } else { + if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) + call_rcu(&page->rcu_head, kmem_rcu_free); + else kmem_freepages(cachep, page); - } /* * From now on, we don't use freelist @@ -2633,7 +2622,7 @@ static int cache_grow(struct kmem_cache *cachep, offset *= cachep->colour_off; - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_enable(); /* @@ -2663,7 +2652,7 @@ static int cache_grow(struct kmem_cache *cachep, cache_init_objs(cachep, page); - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); check_irq_off(); spin_lock(&n->list_lock); @@ -2677,7 +2666,7 @@ static int cache_grow(struct kmem_cache *cachep, opps1: kmem_freepages(cachep, page); failed: - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); return 0; } @@ -2869,7 +2858,7 @@ force_grow: static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) { - might_sleep_if(flags & __GFP_WAIT); + 
might_sleep_if(gfpflags_allow_blocking(flags)); #if DEBUG kmem_flagcheck(cachep, flags); #endif @@ -3057,11 +3046,11 @@ retry: */ struct page *page; - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_enable(); kmem_flagcheck(cache, flags); page = kmem_getpages(cache, local_flags, numa_mem_id()); - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); if (page) { /* diff --git a/mm/slub.c b/mm/slub.c index 75a5fa92ac2a..7cb4bf9ae320 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1265,7 +1265,7 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, { flags &= gfp_allowed_mask; lockdep_trace_alloc(flags); - might_sleep_if(flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(flags)); if (should_failslab(s->object_size, flags, s->flags)) return NULL; @@ -1353,7 +1353,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) flags &= gfp_allowed_mask; - if (flags & __GFP_WAIT) + if (gfpflags_allow_blocking(flags)) local_irq_enable(); flags |= s->allocflags; @@ -1363,8 +1363,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * so we fall-back to the minimum order allocation. 
*/ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; - if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min)) - alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT; + if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM; page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { @@ -1424,7 +1424,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page->frozen = 1; out: - if (flags & __GFP_WAIT) + if (gfpflags_allow_blocking(flags)) local_irq_disable(); if (!page) return NULL; @@ -1507,10 +1507,7 @@ static void free_slab(struct kmem_cache *s, struct page *page) VM_BUG_ON(s->reserved != sizeof(*head)); head = page_address(page) + offset; } else { - /* - * RCU free overloads the RCU head over the LRU - */ - head = (void *)&page->lru; + head = &page->rcu_head; } call_rcu(head, rcu_free_slab); diff --git a/mm/swap.c b/mm/swap.c index 983f692a47fd..39395fb549c0 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -201,7 +201,7 @@ out_put_single: __put_single_page(page); return; } - VM_BUG_ON_PAGE(page_head != page->first_page, page); + VM_BUG_ON_PAGE(page_head != compound_head(page), page); /* * We can release the refcount taken by * get_page_unless_zero() now that @@ -262,7 +262,7 @@ static void put_compound_page(struct page *page) * Case 3 is possible, as we may race with * __split_huge_page_refcount tearing down a THP page. 
*/ - page_head = compound_head_by_tail(page); + page_head = compound_head(page); if (!__compound_tail_refcounted(page_head)) put_unrefcounted_compound_page(page_head, page); else diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9db9ef5e8481..d04563480c94 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -35,6 +35,8 @@ #include <asm/tlbflush.h> #include <asm/shmparam.h> +#include "internal.h" + struct vfree_deferred { struct llist_head list; struct work_struct wq; @@ -1617,7 +1619,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; - if (gfp_mask & __GFP_WAIT) + if (gfpflags_allow_blocking(gfp_mask)) cond_resched(); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 55721b619aee..2aec4241b42a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1476,7 +1476,7 @@ static int too_many_isolated(struct zone *zone, int file, * won't get blocked by normal direct-reclaimers, forming a circular * deadlock. */ - if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) inactive >>= 3; return isolated > inactive; @@ -2477,7 +2477,7 @@ static inline bool compaction_ready(struct zone *zone, int order) balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); - watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0); /* * If compaction is deferred, reclaim up to a point where @@ -2960,7 +2960,7 @@ static bool zone_balanced(struct zone *zone, int order, unsigned long balance_gap, int classzone_idx) { if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + - balance_gap, classzone_idx, 0)) + balance_gap, classzone_idx)) return false; if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, @@ -3791,7 +3791,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, 
unsigned int order) /* * Do not scan if the allocation should not be delayed. */ - if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) + if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) return ZONE_RECLAIM_NOSCAN; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index ffcb4f58bf3e..879a2be23325 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -923,7 +923,7 @@ static char * const migratetype_names[MIGRATE_TYPES] = { "Unmovable", "Reclaimable", "Movable", - "Reserve", + "HighAtomic", #ifdef CONFIG_CMA "CMA", #endif diff --git a/mm/zbud.c b/mm/zbud.c index fa48bcdff9d5..d8a181fd779b 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -137,7 +137,7 @@ static const struct zbud_ops zbud_zpool_ops = { .evict = zbud_zpool_evict }; -static void *zbud_zpool_create(char *name, gfp_t gfp, +static void *zbud_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { diff --git a/mm/zpool.c b/mm/zpool.c index 8f670d3e8706..fd3ff719c32c 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -18,8 +18,6 @@ #include <linux/zpool.h> struct zpool { - char *type; - struct zpool_driver *driver; void *pool; const struct zpool_ops *ops; @@ -73,7 +71,8 @@ int zpool_unregister_driver(struct zpool_driver *driver) } EXPORT_SYMBOL(zpool_unregister_driver); -static struct zpool_driver *zpool_get_driver(char *type) +/* this assumes @type is null-terminated. */ +static struct zpool_driver *zpool_get_driver(const char *type) { struct zpool_driver *driver; @@ -113,6 +112,8 @@ static void zpool_put_driver(struct zpool_driver *driver) * not be loaded, and calling @zpool_create_pool() with the pool type will * fail. * + * The @type string must be null-terminated. + * * Returns: true if @type pool is available, false if not */ bool zpool_has_pool(char *type) @@ -145,9 +146,11 @@ EXPORT_SYMBOL(zpool_has_pool); * * Implementations must guarantee this to be thread-safe. * + * The @type and @name strings must be null-terminated. 
+ * * Returns: New zpool on success, NULL on failure. */ -struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, +struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, const struct zpool_ops *ops) { struct zpool_driver *driver; @@ -174,7 +177,6 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, return NULL; } - zpool->type = driver->type; zpool->driver = driver; zpool->pool = driver->create(name, gfp, ops, zpool); zpool->ops = ops; @@ -208,7 +210,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, */ void zpool_destroy_pool(struct zpool *zpool) { - pr_debug("destroying pool type %s\n", zpool->type); + pr_debug("destroying pool type %s\n", zpool->driver->type); spin_lock(&pools_lock); list_del(&zpool->list); @@ -228,9 +230,9 @@ void zpool_destroy_pool(struct zpool *zpool) * * Returns: The type of zpool. */ -char *zpool_get_type(struct zpool *zpool) +const char *zpool_get_type(struct zpool *zpool) { - return zpool->type; + return zpool->driver->type; } /** diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index f135b1b6fcdc..9f15bdd9163c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -16,7 +16,7 @@ * struct page(s) to form a zspage. * * Usage of struct page fields: - * page->first_page: points to the first component (0-order) page + * page->private: points to the first component (0-order) page * page->index (union with page->freelist): offset of the first object * starting in this page. For the first page, this is * always 0, so we use this field (aka freelist) to point @@ -26,8 +26,7 @@ * * For _first_ page only: * - * page->private (union with page->first_page): refers to the - * component page after the first page + * page->private: refers to the component page after the first page * If the page is first_page for huge object, it stores handle. * Look at size_class->huge. * page->freelist: points to the first free object in zspage. 
@@ -38,6 +37,7 @@ * page->lru: links together first pages of various zspages. * Basically forming list of zspages in a fullness group. * page->mapping: class index and fullness group of the zspage + * page->inuse: the number of objects that are used in this zspage * * Usage of struct page flags: * PG_private: identifies the first component page @@ -58,7 +58,7 @@ #include <linux/cpumask.h> #include <linux/cpu.h> #include <linux/vmalloc.h> -#include <linux/hardirq.h> +#include <linux/preempt.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/debugfs.h> @@ -166,9 +166,14 @@ enum zs_stat_type { OBJ_USED, CLASS_ALMOST_FULL, CLASS_ALMOST_EMPTY, - NR_ZS_STAT_TYPE, }; +#ifdef CONFIG_ZSMALLOC_STAT +#define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1) +#else +#define NR_ZS_STAT_TYPE (OBJ_USED + 1) +#endif + struct zs_size_stat { unsigned long objs[NR_ZS_STAT_TYPE]; }; @@ -237,7 +242,7 @@ struct link_free { }; struct zs_pool { - char *name; + const char *name; struct size_class **size_class; struct kmem_cache *handle_cachep; @@ -311,7 +316,7 @@ static void record_obj(unsigned long handle, unsigned long obj) #ifdef CONFIG_ZPOOL -static void *zs_zpool_create(char *name, gfp_t gfp, +static void *zs_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { @@ -447,19 +452,23 @@ static int get_size_class_index(int size) static inline void zs_stat_inc(struct size_class *class, enum zs_stat_type type, unsigned long cnt) { - class->stats.objs[type] += cnt; + if (type < NR_ZS_STAT_TYPE) + class->stats.objs[type] += cnt; } static inline void zs_stat_dec(struct size_class *class, enum zs_stat_type type, unsigned long cnt) { - class->stats.objs[type] -= cnt; + if (type < NR_ZS_STAT_TYPE) + class->stats.objs[type] -= cnt; } static inline unsigned long zs_stat_get(struct size_class *class, enum zs_stat_type type) { - return class->stats.objs[type]; + if (type < NR_ZS_STAT_TYPE) + return class->stats.objs[type]; + return 0; } #ifdef 
CONFIG_ZSMALLOC_STAT @@ -548,7 +557,7 @@ static const struct file_operations zs_stat_size_ops = { .release = single_release, }; -static int zs_pool_stat_create(char *name, struct zs_pool *pool) +static int zs_pool_stat_create(const char *name, struct zs_pool *pool) { struct dentry *entry; @@ -588,7 +597,7 @@ static void __exit zs_stat_exit(void) { } -static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) +static inline int zs_pool_stat_create(const char *name, struct zs_pool *pool) { return 0; } @@ -764,7 +773,7 @@ static struct page *get_first_page(struct page *page) if (is_first_page(page)) return page; else - return page->first_page; + return (struct page *)page_private(page); } static struct page *get_next_page(struct page *page) @@ -824,7 +833,7 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page, { if (class->huge) { VM_BUG_ON(!is_first_page(page)); - return *(unsigned long *)page_private(page); + return page_private(page); } else return *(unsigned long *)obj; } @@ -949,7 +958,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags) * Allocate individual pages and link them together as: * 1. first page->private = first sub-page * 2. all sub-pages are linked together using page->lru - * 3. each sub-page is linked to the first page using page->first_page + * 3. each sub-page is linked to the first page using page->private * * For each size class, First/Head pages are linked together using * page->lru. 
Also, we set PG_private to identify the first page @@ -974,7 +983,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags) if (i == 1) set_page_private(first_page, (unsigned long)page); if (i >= 1) - page->first_page = first_page; + set_page_private(page, (unsigned long)first_page); if (i >= 2) list_add(&page->lru, &prev_page->lru); if (i == class->pages_per_zspage - 1) /* last page */ @@ -1428,8 +1437,6 @@ static void obj_free(struct zs_pool *pool, struct size_class *class, struct page *first_page, *f_page; unsigned long f_objidx, f_offset; void *vaddr; - int class_idx; - enum fullness_group fullness; BUG_ON(!obj); @@ -1437,7 +1444,6 @@ static void obj_free(struct zs_pool *pool, struct size_class *class, obj_to_location(obj, &f_page, &f_objidx); first_page = get_first_page(f_page); - get_zspage_mapping(first_page, &class_idx, &fullness); f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); vaddr = kmap_atomic(f_page); @@ -1822,9 +1828,6 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, struct zs_pool *pool = container_of(shrinker, struct zs_pool, shrinker); - if (!pool->shrinker_enabled) - return 0; - for (i = zs_size_classes - 1; i >= 0; i--) { class = pool->size_class[i]; if (!class) @@ -1866,7 +1869,7 @@ static int zs_register_shrinker(struct zs_pool *pool) * On success, a pointer to the newly created pool is returned, * otherwise NULL. 
*/ -struct zs_pool *zs_create_pool(char *name, gfp_t flags) +struct zs_pool *zs_create_pool(const char *name, gfp_t flags) { int i; struct zs_pool *pool; diff --git a/mm/zswap.c b/mm/zswap.c index 4043df7c672f..025f8dc723de 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -82,33 +82,27 @@ module_param_named(enabled, zswap_enabled, bool, 0644); /* Crypto compressor to use */ #define ZSWAP_COMPRESSOR_DEFAULT "lzo" -static char zswap_compressor[CRYPTO_MAX_ALG_NAME] = ZSWAP_COMPRESSOR_DEFAULT; -static struct kparam_string zswap_compressor_kparam = { - .string = zswap_compressor, - .maxlen = sizeof(zswap_compressor), -}; +static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; static int zswap_compressor_param_set(const char *, const struct kernel_param *); static struct kernel_param_ops zswap_compressor_param_ops = { .set = zswap_compressor_param_set, - .get = param_get_string, + .get = param_get_charp, + .free = param_free_charp, }; module_param_cb(compressor, &zswap_compressor_param_ops, - &zswap_compressor_kparam, 0644); + &zswap_compressor, 0644); /* Compressed storage zpool to use */ #define ZSWAP_ZPOOL_DEFAULT "zbud" -static char zswap_zpool_type[32 /* arbitrary */] = ZSWAP_ZPOOL_DEFAULT; -static struct kparam_string zswap_zpool_kparam = { - .string = zswap_zpool_type, - .maxlen = sizeof(zswap_zpool_type), -}; +static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; static int zswap_zpool_param_set(const char *, const struct kernel_param *); static struct kernel_param_ops zswap_zpool_param_ops = { - .set = zswap_zpool_param_set, - .get = param_get_string, + .set = zswap_zpool_param_set, + .get = param_get_charp, + .free = param_free_charp, }; -module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_kparam, 0644); +module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); /* The maximum percentage of memory that the compressed pool can occupy */ static unsigned int zswap_max_pool_percent = 20; @@ -342,7 +336,7 @@ static void zswap_entry_put(struct 
zswap_tree *tree, static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, pgoff_t offset) { - struct zswap_entry *entry = NULL; + struct zswap_entry *entry; entry = zswap_rb_search(root, offset); if (entry) @@ -571,7 +565,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { struct zswap_pool *pool; - gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) { @@ -615,19 +609,29 @@ error: return NULL; } -static struct zswap_pool *__zswap_pool_create_fallback(void) +static __init struct zswap_pool *__zswap_pool_create_fallback(void) { if (!crypto_has_comp(zswap_compressor, 0, 0)) { + if (!strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) { + pr_err("default compressor %s not available\n", + zswap_compressor); + return NULL; + } pr_err("compressor %s not available, using default %s\n", zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT); - strncpy(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT, - sizeof(zswap_compressor)); + param_free_charp(&zswap_compressor); + zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; } if (!zpool_has_pool(zswap_zpool_type)) { + if (!strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { + pr_err("default zpool %s not available\n", + zswap_zpool_type); + return NULL; + } pr_err("zpool %s not available, using default %s\n", zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT); - strncpy(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT, - sizeof(zswap_zpool_type)); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; } return zswap_pool_create(zswap_zpool_type, zswap_compressor); @@ -684,43 +688,39 @@ static void zswap_pool_put(struct zswap_pool *pool) * param callbacks **********************************/ +/* val must be a null-terminated string */ static int __zswap_param_set(const char *val, const struct kernel_param *kp, char *type, char 
*compressor) { struct zswap_pool *pool, *put_pool = NULL; - char str[kp->str->maxlen], *s; + char *s = strstrip((char *)val); int ret; - /* - * kp is either zswap_zpool_kparam or zswap_compressor_kparam, defined - * at the top of this file, so maxlen is CRYPTO_MAX_ALG_NAME (64) or - * 32 (arbitrary). - */ - strlcpy(str, val, kp->str->maxlen); - s = strim(str); + /* no change required */ + if (!strcmp(s, *(char **)kp->arg)) + return 0; /* if this is load-time (pre-init) param setting, * don't create a pool; that's done during init. */ if (!zswap_init_started) - return param_set_copystring(s, kp); - - /* no change required */ - if (!strncmp(kp->str->string, s, kp->str->maxlen)) - return 0; + return param_set_charp(s, kp); if (!type) { - type = s; - if (!zpool_has_pool(type)) { - pr_err("zpool %s not available\n", type); + if (!zpool_has_pool(s)) { + pr_err("zpool %s not available\n", s); return -ENOENT; } + type = s; } else if (!compressor) { - compressor = s; - if (!crypto_has_comp(compressor, 0, 0)) { - pr_err("compressor %s not available\n", compressor); + if (!crypto_has_comp(s, 0, 0)) { + pr_err("compressor %s not available\n", s); return -ENOENT; } + compressor = s; + } else { + WARN_ON(1); + return -EINVAL; } spin_lock(&zswap_pools_lock); @@ -736,7 +736,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, } if (pool) - ret = param_set_copystring(s, kp); + ret = param_set_charp(s, kp); else ret = -EINVAL; @@ -1011,7 +1011,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* store */ len = dlen + sizeof(struct zswap_header); ret = zpool_malloc(entry->pool->zpool, len, - __GFP_NORETRY | __GFP_NOWARN, &handle); + __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, + &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; goto put_dstmem; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fab4599ba8b2..aa41e6dd6429 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -414,7 +414,7 @@ struct sk_buff 
*__netdev_alloc_skb(struct net_device *dev, unsigned int len, len += NET_SKB_PAD; if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || - (gfp_mask & (__GFP_WAIT | GFP_DMA))) { + (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) goto skb_fail; @@ -481,7 +481,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, len += NET_SKB_PAD + NET_IP_ALIGN; if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || - (gfp_mask & (__GFP_WAIT | GFP_DMA))) { + (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) goto skb_fail; @@ -4452,7 +4452,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, return NULL; gfp_head = gfp_mask; - if (gfp_head & __GFP_WAIT) + if (gfp_head & __GFP_DIRECT_RECLAIM) gfp_head |= __GFP_REPEAT; *errcode = -ENOBUFS; @@ -4467,7 +4467,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, while (order) { if (npages >= 1 << order) { - page = alloc_pages((gfp_mask & ~__GFP_WAIT) | + page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, diff --git a/net/core/sock.c b/net/core/sock.c index 7529eb9463be..1e4dd54bfb5a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1944,8 +1944,10 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) pfrag->offset = 0; if (SKB_FRAG_PAGE_ORDER) { - pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP | - __GFP_NOWARN | __GFP_NORETRY, + /* Avoid direct reclaim but allow kswapd to wake */ + pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | + __GFP_COMP | __GFP_NOWARN | + __GFP_NORETRY, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index fafe33bdb619..59651af8cc27 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2116,7 +2116,7 @@ int 
netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid consume_skb(info.skb2); if (info.delivered) { - if (info.congested && (allocation & __GFP_WAIT)) + if (info.congested && gfpflags_allow_blocking(allocation)) yield(); return 0; } diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 96744b75db93..977fb86065b7 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -305,7 +305,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, gfp_t slab_mask = GFP_NOWAIT; gfp_t page_mask = GFP_NOWAIT; - if (gfp & __GFP_WAIT) { + if (gfp & __GFP_DIRECT_RECLAIM) { slab_mask = GFP_KERNEL; page_mask = GFP_HIGHUSER; } @@ -379,7 +379,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) struct ib_recv_wr *failed_wr; unsigned int posted = 0; int ret = 0; - bool can_wait = !!(gfp & __GFP_WAIT); + bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM); u32 pos; /* the goal here is to just make sure that someone, somewhere diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c index 692b3e67fb54..6c71ed1caf16 100644 --- a/net/rxrpc/ar-connection.c +++ b/net/rxrpc/ar-connection.c @@ -500,7 +500,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, if (bundle->num_conns >= 20) { _debug("too many conns"); - if (!(gfp & __GFP_WAIT)) { + if (!gfpflags_allow_blocking(gfp)) { _leave(" = -EAGAIN"); return -EAGAIN; } diff --git a/net/sctp/associola.c b/net/sctp/associola.c index b00f1f9611d6..559afd0ee7de 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1590,7 +1590,7 @@ int sctp_assoc_lookup_laddr(struct sctp_association *asoc, /* Set an association id for a given association */ int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp) { - bool preload = !!(gfp & __GFP_WAIT); + bool preload = gfpflags_allow_blocking(gfp); int ret; /* If the id is already assigned, keep it. 
*/ diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index f2a1131b2f8b..2b3c22808c3b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -370,6 +370,8 @@ our $typeTypedefs = qr{(?x: $typeKernelTypedefs\b )}; +our $zero_initializer = qr{(?:(?:0[xX])?0+$Int_type?|NULL|false)\b}; + our $logFunctions = qr{(?x: printk(?:_ratelimited|_once|)| (?:[a-z0-9]+_){1,2}(?:printk|emerg|alert|crit|err|warning|warn|notice|info|debug|dbg|vdbg|devel|cont|WARN)(?:_ratelimited|_once|)| @@ -2313,42 +2315,43 @@ sub process { "Remove Gerrit Change-Id's before submitting upstream.\n" . $herecurr); } +# Check if the commit log is in a possible stack dump + if ($in_commit_log && !$commit_log_possible_stack_dump && + ($line =~ /^\s*(?:WARNING:|BUG:)/ || + $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ || + # timestamp + $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) { + # stack dump address + $commit_log_possible_stack_dump = 1; + } + # Check for line lengths > 75 in commit log, warn once if ($in_commit_log && !$commit_log_long_line && - length($line) > 75 && - !($line =~ /^\s*[a-zA-Z0-9_\/\.]+\s+\|\s+\d+/ || - # file delta changes - $line =~ /^\s*(?:[\w\.\-]+\/)++[\w\.\-]+:/ || - # filename then : - $line =~ /^\s*(?:Fixes:|Link:)/i || - # A Fixes: or Link: line - $commit_log_possible_stack_dump)) { + length($line) > 75 && + !($line =~ /^\s*[a-zA-Z0-9_\/\.]+\s+\|\s+\d+/ || + # file delta changes + $line =~ /^\s*(?:[\w\.\-]+\/)++[\w\.\-]+:/ || + # filename then : + $line =~ /^\s*(?:Fixes:|Link:)/i || + # A Fixes: or Link: line + $commit_log_possible_stack_dump)) { WARN("COMMIT_LOG_LONG_LINE", "Possible unwrapped commit description (prefer a maximum 75 chars per line)\n" . 
$herecurr); $commit_log_long_line = 1; } -# Check if the commit log is in a possible stack dump - if ($in_commit_log && !$commit_log_possible_stack_dump && - ($line =~ /^\s*(?:WARNING:|BUG:)/ || - $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ || - # timestamp - $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) { - # stack dump address - $commit_log_possible_stack_dump = 1; - } - # Reset possible stack dump if a blank line is found - if ($in_commit_log && $commit_log_possible_stack_dump && - $line =~ /^\s*$/) { - $commit_log_possible_stack_dump = 0; - } + if ($in_commit_log && $commit_log_possible_stack_dump && + $line =~ /^\s*$/) { + $commit_log_possible_stack_dump = 0; + } # Check for git id commit length and improperly formed commit descriptions - if ($in_commit_log && + if ($in_commit_log && !$commit_log_possible_stack_dump && ($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i || - ($line =~ /\b[0-9a-f]{12,40}\b/i && - $line !~ /\bfixes:\s*[0-9a-f]{12,40}/i))) { + ($line =~ /\b[0-9a-f]{12,40}\b/i && + $line !~ /[\<\[][0-9a-f]{12,40}[\>\]]/i && + $line !~ /\bfixes:\s*[0-9a-f]{12,40}/i))) { my $init_char = "c"; my $orig_commit = ""; my $short = 1; @@ -3333,21 +3336,20 @@ sub process { } # check for global initialisers. - if ($line =~ /^\+$Type\s*$Ident(?:\s+$Modifier)*\s*=\s*(?:0|NULL|false)\s*;/) { + if ($line =~ /^\+$Type\s*$Ident(?:\s+$Modifier)*\s*=\s*($zero_initializer)\s*;/) { if (ERROR("GLOBAL_INITIALISERS", - "do not initialise globals to 0 or NULL\n" . - $herecurr) && + "do not initialise globals to $1\n" . $herecurr) && $fix) { - $fixed[$fixlinenr] =~ s/(^.$Type\s*$Ident(?:\s+$Modifier)*)\s*=\s*(0|NULL|false)\s*;/$1;/; + $fixed[$fixlinenr] =~ s/(^.$Type\s*$Ident(?:\s+$Modifier)*)\s*=\s*$zero_initializer\s*;/$1;/; } } # check for static initialisers. - if ($line =~ /^\+.*\bstatic\s.*=\s*(0|NULL|false)\s*;/) { + if ($line =~ /^\+.*\bstatic\s.*=\s*($zero_initializer)\s*;/) { if (ERROR("INITIALISED_STATIC", - "do not initialise statics to 0 or NULL\n" . 
+ "do not initialise statics to $1\n" . $herecurr) && $fix) { - $fixed[$fixlinenr] =~ s/(\bstatic\s.*?)\s*=\s*(0|NULL|false)\s*;/$1;/; + $fixed[$fixlinenr] =~ s/(\bstatic\s.*?)\s*=\s*$zero_initializer\s*;/$1;/; } } diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 98bae869f6d0..cab641a12dd5 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -781,6 +781,7 @@ MAINTAINER field selection options: --git-max-maintainers => maximum maintainers to add (default: $email_git_max_maintainers) --git-min-percent => minimum percentage of commits required (default: $email_git_min_percent) --git-blame => use git blame to find modified commits for patch or file + --git-blame-signatures => when used with --git-blame, also include all commit signers --git-since => git history to use (default: $email_git_since) --hg-since => hg history to use (default: $email_hg_since) --interactive => display a menu (mostly useful if used with the --git option) @@ -812,7 +813,7 @@ Other options: --help => show this help information Default options: - [--email --nogit --git-fallback --m --n --l --multiline -pattern-depth=0 + [--email --nogit --git-fallback --m --r --n --l --multiline --pattern-depth=0 --remove-duplicates --rolestats] Notes: @@ -844,6 +845,9 @@ Notes: Entries in this file can be any command line argument. This file is prepended to any additional command line arguments. Multiple lines and # comments are allowed. + Most options have both positive and negative forms. + The negative forms for --<foo> are --no<foo> and --no-<foo>. 
+ EOT } @@ -970,20 +974,29 @@ sub find_ending_index { return $index; } -sub get_maintainer_role { +sub get_subsystem_name { my ($index) = @_; - my $i; my $start = find_starting_index($index); - my $end = find_ending_index($index); - my $role = "unknown"; my $subsystem = $typevalue[$start]; if ($output_section_maxlen && length($subsystem) > $output_section_maxlen) { $subsystem = substr($subsystem, 0, $output_section_maxlen - 3); $subsystem =~ s/\s*$//; $subsystem = $subsystem . "..."; } + return $subsystem; +} + +sub get_maintainer_role { + my ($index) = @_; + + my $i; + my $start = find_starting_index($index); + my $end = find_ending_index($index); + + my $role = "unknown"; + my $subsystem = get_subsystem_name($index); for ($i = $start + 1; $i < $end; $i++) { my $tv = $typevalue[$i]; @@ -1017,16 +1030,7 @@ sub get_maintainer_role { sub get_list_role { my ($index) = @_; - my $i; - my $start = find_starting_index($index); - my $end = find_ending_index($index); - - my $subsystem = $typevalue[$start]; - if ($output_section_maxlen && length($subsystem) > $output_section_maxlen) { - $subsystem = substr($subsystem, 0, $output_section_maxlen - 3); - $subsystem =~ s/\s*$//; - $subsystem = $subsystem . 
"..."; - } + my $subsystem = get_subsystem_name($index); if ($subsystem eq "THE REST") { $subsystem = ""; @@ -1114,7 +1118,8 @@ sub add_categories { } } if ($email_reviewer) { - push_email_addresses($pvalue, 'reviewer'); + my $subsystem = get_subsystem_name($i); + push_email_addresses($pvalue, "reviewer:$subsystem"); } } elsif ($ptype eq "T") { push(@scm, $pvalue); diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index e24121afb2f2..6eb62936c672 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -126,7 +126,7 @@ static void *ima_alloc_pages(loff_t max_size, size_t *allocated_size, { void *ptr; int order = ima_maxorder; - gfp_t gfp_mask = __GFP_WAIT | __GFP_NOWARN | __GFP_NORETRY; + gfp_t gfp_mask = __GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY; if (order) order = min(get_order(max_size), order); diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index cfe121353eec..4b4957b8df4e 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -6,6 +6,7 @@ TARGETS += firmware TARGETS += ftrace TARGETS += futex TARGETS += kcmp +TARGETS += lib TARGETS += membarrier TARGETS += memfd TARGETS += memory-hotplug diff --git a/tools/testing/selftests/lib/Makefile b/tools/testing/selftests/lib/Makefile new file mode 100644 index 000000000000..47147b968514 --- /dev/null +++ b/tools/testing/selftests/lib/Makefile @@ -0,0 +1,8 @@ +# Makefile for lib/ function selftests + +# No binaries, but make sure arg-less "make" doesn't trigger "run_tests" +all: + +TEST_PROGS := printf.sh + +include ../lib.mk diff --git a/tools/testing/selftests/lib/printf.sh b/tools/testing/selftests/lib/printf.sh new file mode 100644 index 000000000000..4fdc70fe6980 --- /dev/null +++ b/tools/testing/selftests/lib/printf.sh @@ -0,0 +1,10 @@ +#!/bin/sh +# Runs printf infrastructure using test_printf kernel module + +if /sbin/modprobe -q test_printf; then + /sbin/modprobe -q -r 
test_printf + echo "printf: ok" +else + echo "printf: [FAIL]" + exit 1 +fi |