diff options
135 files changed, 1002 insertions, 1056 deletions
diff --git a/Documentation/networking/rds.txt b/Documentation/networking/rds.txt index c67077cbeb80..e1a3d59bbe0f 100644 --- a/Documentation/networking/rds.txt +++ b/Documentation/networking/rds.txt @@ -62,11 +62,10 @@ Socket Interface ================ AF_RDS, PF_RDS, SOL_RDS - These constants haven't been assigned yet, because RDS isn't in - mainline yet. Currently, the kernel module assigns some constant - and publishes it to user space through two sysctl files - /proc/sys/net/rds/pf_rds - /proc/sys/net/rds/sol_rds + AF_RDS and PF_RDS are the domain type to be used with socket(2) + to create RDS sockets. SOL_RDS is the socket-level to be used + with setsockopt(2) and getsockopt(2) for RDS specific socket + options. fd = socket(PF_RDS, SOCK_SEQPACKET, 0); This creates a new, unbound RDS socket. diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 99824ff8dd35..df7d8cbee377 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -21,7 +21,7 @@ #include <linux/module.h> #include <linux/seq_file.h> #include <linux/mount.h> -#include <linux/aio.h> +#include <linux/uio.h> #include <asm/ebcdic.h> #include "hypfs.h" diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index 0a465e0f3012..1396ad0787fc 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -56,8 +56,8 @@ static int hash_sendmsg(struct socket *sock, struct msghdr *msg, ctx->more = 0; - while (iov_iter_count(&msg->msg_iter)) { - int len = iov_iter_count(&msg->msg_iter); + while (msg_data_left(msg)) { + int len = msg_data_left(msg); if (len > limit) len = limit; diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index 0aa02635ceda..945075292bc9 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -106,7 +106,7 @@ static void skcipher_async_cb(struct crypto_async_request *req, int err) atomic_dec(&ctx->inflight); skcipher_free_async_sgls(sreq); kfree(req); - aio_complete(iocb, err, err); + iocb->ki_complete(iocb, err, err); } static inline int skcipher_sndbuf(struct sock *sk) @@ -641,7 +641,7 @@ static int skcipher_recvmsg_sync(struct socket *sock, struct msghdr *msg, long copied = 0; lock_sock(sk); - while (iov_iter_count(&msg->msg_iter)) { + while (msg_data_left(msg)) { sgl = list_first_entry(&ctx->tsgl, struct skcipher_sg_list, list); sg = sgl->sg; @@ -655,7 +655,7 @@ static int skcipher_recvmsg_sync(struct socket *sock, struct msghdr *msg, goto unlock; } - used = min_t(unsigned long, ctx->used, iov_iter_count(&msg->msg_iter)); + used = min_t(unsigned long, ctx->used, msg_data_left(msg)); used = af_alg_make_sg(&ctx->rsgl, &msg->msg_iter, used); err = used; diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 297110c12635..9c4fd7a8e2e5 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -26,7 +26,7 @@ #include <linux/pfn.h> #include <linux/export.h> #include <linux/io.h> -#include <linux/aio.h> +#include <linux/uio.h> #include <linux/uaccess.h> diff --git a/drivers/char/tile-srom.c b/drivers/char/tile-srom.c index 02e76ac6d282..69f6b4acc377 100644 --- a/drivers/char/tile-srom.c +++ b/drivers/char/tile-srom.c @@ -27,7 +27,6 @@ #include <linux/types.h> /* size_t */ #include <linux/proc_fs.h> #include <linux/fcntl.h> /* O_ACCMODE */ -#include <linux/aio.h> #include <linux/pagemap.h> #include <linux/hugetlb.h> #include <linux/uaccess.h> diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 6d7f453b4d05..aed8afee56da 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -40,7 +40,6 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/io.h> -#include <linux/aio.h> #include <linux/jiffies.h> #include <linux/cpu.h> #include <asm/pgtable.h> diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 41937c6f888a..14046f5a37fa 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -39,7 +39,6 @@ #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/io.h> -#include <linux/aio.h> #include <linux/jiffies.h> #include <asm/pgtable.h> #include <linux/delay.h> diff --git a/drivers/misc/mei/amthif.c b/drivers/misc/mei/amthif.c index c4cb9a984a5f..40ea639fa413 100644 --- a/drivers/misc/mei/amthif.c +++ b/drivers/misc/mei/amthif.c @@ -19,7 +19,6 @@ #include <linux/errno.h> #include <linux/types.h> #include <linux/fcntl.h> -#include <linux/aio.h> #include <linux/ioctl.h> #include <linux/cdev.h> #include <linux/list.h> diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c index 3c019c0e60eb..47680c84801c 100644 --- a/drivers/misc/mei/main.c +++ b/drivers/misc/mei/main.c @@ -22,7 +22,6 @@ #include <linux/errno.h> #include <linux/types.h> #include <linux/fcntl.h> -#include <linux/aio.h> #include <linux/poll.h> #include <linux/init.h> #include <linux/ioctl.h> diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index bd3039ab8f98..af44ee26075d 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -21,7 +21,6 @@ #include <linux/errno.h> #include <linux/types.h> #include <linux/fcntl.h> -#include <linux/aio.h> #include <linux/pci.h> #include <linux/poll.h> #include <linux/ioctl.h> diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index fa8f9e147c34..5cb93d1f50a4 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -123,7 +123,7 @@ bgmac_dma_tx_add_buf(struct bgmac *bgmac, struct bgmac_dma_ring *ring, struct bgmac_dma_desc *dma_desc; u32 ctl1; - if (i == ring->num_slots - 1) + if (i == BGMAC_TX_RING_SLOTS - 1) ctl0 |= BGMAC_DESC_CTL0_EOT; ctl1 = len & BGMAC_DESC_CTL1_LEN; @@ -142,11 +142,10 @@ static netdev_tx_t bgmac_dma_tx_add(struct bgmac *bgmac, { struct device *dma_dev = bgmac->core->dma_dev; struct net_device *net_dev = bgmac->net_dev; - struct bgmac_slot_info *slot = &ring->slots[ring->end]; - int free_slots; + int index = ring->end % BGMAC_TX_RING_SLOTS; + struct bgmac_slot_info *slot = &ring->slots[index]; int nr_frags; u32 flags; - int index = ring->end; int i; if (skb->len > BGMAC_DESC_CTL1_LEN) { @@ -159,12 +158,10 @@ static netdev_tx_t bgmac_dma_tx_add(struct bgmac *bgmac, nr_frags = skb_shinfo(skb)->nr_frags; - if (ring->start <= ring->end) - free_slots = ring->start - ring->end + BGMAC_TX_RING_SLOTS; - else - free_slots = ring->start - ring->end; - - if (free_slots <= nr_frags + 1) { + /* ring->end - ring->start will return the number of valid slots, + * even when ring->end overflows + */ + if (ring->end - ring->start + nr_frags + 1 >= BGMAC_TX_RING_SLOTS) { bgmac_err(bgmac, "TX ring is full, queue should be stopped!\n"); netif_stop_queue(net_dev); return NETDEV_TX_BUSY; @@ -200,7 +197,7 @@ static netdev_tx_t bgmac_dma_tx_add(struct bgmac *bgmac, } slot->skb = skb; - + ring->end += nr_frags + 1; netdev_sent_queue(net_dev, skb->len); wmb(); @@ -208,13 +205,12 @@ static netdev_tx_t bgmac_dma_tx_add(struct bgmac *bgmac, /* Increase ring->end to point empty slot. We tell hardware the first * slot it should *not* read. */ - ring->end = (index + 1) % BGMAC_TX_RING_SLOTS; bgmac_write(bgmac, ring->mmio_base + BGMAC_DMA_TX_INDEX, ring->index_base + - ring->end * sizeof(struct bgmac_dma_desc)); + (ring->end % BGMAC_TX_RING_SLOTS) * + sizeof(struct bgmac_dma_desc)); - free_slots -= nr_frags + 1; - if (free_slots < 8) + if (ring->end - ring->start >= BGMAC_TX_RING_SLOTS - 8) netif_stop_queue(net_dev); return NETDEV_TX_OK; @@ -256,17 +252,17 @@ static void bgmac_dma_tx_free(struct bgmac *bgmac, struct bgmac_dma_ring *ring) empty_slot &= BGMAC_DMA_TX_STATDPTR; empty_slot /= sizeof(struct bgmac_dma_desc); - while (ring->start != empty_slot) { - struct bgmac_slot_info *slot = &ring->slots[ring->start]; - u32 ctl1 = le32_to_cpu(ring->cpu_base[ring->start].ctl1); - int len = ctl1 & BGMAC_DESC_CTL1_LEN; + while (ring->start != ring->end) { + int slot_idx = ring->start % BGMAC_TX_RING_SLOTS; + struct bgmac_slot_info *slot = &ring->slots[slot_idx]; + u32 ctl1; + int len; - if (!slot->dma_addr) { - bgmac_err(bgmac, "Hardware reported transmission for empty TX ring slot %d! End of ring: %d\n", - ring->start, ring->end); - goto next; - } + if (slot_idx == empty_slot) + break; + ctl1 = le32_to_cpu(ring->cpu_base[slot_idx].ctl1); + len = ctl1 & BGMAC_DESC_CTL1_LEN; if (ctl1 & BGMAC_DESC_CTL0_SOF) /* Unmap no longer used buffer */ dma_unmap_single(dma_dev, slot->dma_addr, len, @@ -284,10 +280,8 @@ static void bgmac_dma_tx_free(struct bgmac *bgmac, struct bgmac_dma_ring *ring) slot->skb = NULL; } -next: slot->dma_addr = 0; - if (++ring->start >= BGMAC_TX_RING_SLOTS) - ring->start = 0; + ring->start++; freed = true; } @@ -352,13 +346,13 @@ static int bgmac_dma_rx_skb_for_slot(struct bgmac *bgmac, return -ENOMEM; /* Poison - if everything goes fine, hardware will overwrite it */ - rx = buf; + rx = buf + BGMAC_RX_BUF_OFFSET; rx->len = cpu_to_le16(0xdead); rx->flags = cpu_to_le16(0xbeef); /* Map skb for the DMA */ - dma_addr = dma_map_single(dma_dev, buf, BGMAC_RX_BUF_SIZE, - DMA_FROM_DEVICE); + dma_addr = dma_map_single(dma_dev, buf + BGMAC_RX_BUF_OFFSET, + BGMAC_RX_BUF_SIZE, DMA_FROM_DEVICE); if (dma_mapping_error(dma_dev, dma_addr)) { bgmac_err(bgmac, "DMA mapping error\n"); put_page(virt_to_head_page(buf)); @@ -372,13 +366,23 @@ static int bgmac_dma_rx_skb_for_slot(struct bgmac *bgmac, return 0; } +static void bgmac_dma_rx_update_index(struct bgmac *bgmac, + struct bgmac_dma_ring *ring) +{ + dma_wmb(); + + bgmac_write(bgmac, ring->mmio_base + BGMAC_DMA_RX_INDEX, + ring->index_base + + ring->end * sizeof(struct bgmac_dma_desc)); +} + static void bgmac_dma_rx_setup_desc(struct bgmac *bgmac, struct bgmac_dma_ring *ring, int desc_idx) { struct bgmac_dma_desc *dma_desc = ring->cpu_base + desc_idx; u32 ctl0 = 0, ctl1 = 0; - if (desc_idx == ring->num_slots - 1) + if (desc_idx == BGMAC_RX_RING_SLOTS - 1) ctl0 |= BGMAC_DESC_CTL0_EOT; ctl1 |= BGMAC_RX_BUF_SIZE & BGMAC_DESC_CTL1_LEN; /* Is there any BGMAC device that requires extension? */ @@ -390,6 +394,21 @@ static void bgmac_dma_rx_setup_desc(struct bgmac *bgmac, dma_desc->addr_high = cpu_to_le32(upper_32_bits(ring->slots[desc_idx].dma_addr)); dma_desc->ctl0 = cpu_to_le32(ctl0); dma_desc->ctl1 = cpu_to_le32(ctl1); + + ring->end = desc_idx; +} + +static void bgmac_dma_rx_poison_buf(struct device *dma_dev, + struct bgmac_slot_info *slot) +{ + struct bgmac_rx_header *rx = slot->buf + BGMAC_RX_BUF_OFFSET; + + dma_sync_single_for_cpu(dma_dev, slot->dma_addr, BGMAC_RX_BUF_SIZE, + DMA_FROM_DEVICE); + rx->len = cpu_to_le16(0xdead); + rx->flags = cpu_to_le16(0xbeef); + dma_sync_single_for_device(dma_dev, slot->dma_addr, BGMAC_RX_BUF_SIZE, + DMA_FROM_DEVICE); } static int bgmac_dma_rx_read(struct bgmac *bgmac, struct bgmac_dma_ring *ring, @@ -404,64 +423,53 @@ static int bgmac_dma_rx_read(struct bgmac *bgmac, struct bgmac_dma_ring *ring, end_slot &= BGMAC_DMA_RX_STATDPTR; end_slot /= sizeof(struct bgmac_dma_desc); - ring->end = end_slot; - - while (ring->start != ring->end) { + while (ring->start != end_slot) { struct device *dma_dev = bgmac->core->dma_dev; struct bgmac_slot_info *slot = &ring->slots[ring->start]; - struct bgmac_rx_header *rx = slot->buf; + struct bgmac_rx_header *rx = slot->buf + BGMAC_RX_BUF_OFFSET; struct sk_buff *skb; void *buf = slot->buf; + dma_addr_t dma_addr = slot->dma_addr; u16 len, flags; - /* Unmap buffer to make it accessible to the CPU */ - dma_sync_single_for_cpu(dma_dev, slot->dma_addr, - BGMAC_RX_BUF_SIZE, DMA_FROM_DEVICE); + do { + /* Prepare new skb as replacement */ + if (bgmac_dma_rx_skb_for_slot(bgmac, slot)) { + bgmac_dma_rx_poison_buf(dma_dev, slot); + break; + } - /* Get info from the header */ - len = le16_to_cpu(rx->len); - flags = le16_to_cpu(rx->flags); + /* Unmap buffer to make it accessible to the CPU */ + dma_unmap_single(dma_dev, dma_addr, + BGMAC_RX_BUF_SIZE, DMA_FROM_DEVICE); - do { - dma_addr_t old_dma_addr = slot->dma_addr; - int err; + /* Get info from the header */ + len = le16_to_cpu(rx->len); + flags = le16_to_cpu(rx->flags); /* Check for poison and drop or pass the packet */ if (len == 0xdead && flags == 0xbeef) { bgmac_err(bgmac, "Found poisoned packet at slot %d, DMA issue!\n", ring->start); - dma_sync_single_for_device(dma_dev, - slot->dma_addr, - BGMAC_RX_BUF_SIZE, - DMA_FROM_DEVICE); + put_page(virt_to_head_page(buf)); break; } - /* Omit CRC. */ - len -= ETH_FCS_LEN; - - /* Prepare new skb as replacement */ - err = bgmac_dma_rx_skb_for_slot(bgmac, slot); - if (err) { - /* Poison the old skb */ - rx->len = cpu_to_le16(0xdead); - rx->flags = cpu_to_le16(0xbeef); - - dma_sync_single_for_device(dma_dev, - slot->dma_addr, - BGMAC_RX_BUF_SIZE, - DMA_FROM_DEVICE); + if (len > BGMAC_RX_ALLOC_SIZE) { + bgmac_err(bgmac, "Found oversized packet at slot %d, DMA issue!\n", + ring->start); + put_page(virt_to_head_page(buf)); break; } - bgmac_dma_rx_setup_desc(bgmac, ring, ring->start); - /* Unmap old skb, we'll pass it to the netfif */ - dma_unmap_single(dma_dev, old_dma_addr, - BGMAC_RX_BUF_SIZE, DMA_FROM_DEVICE); + /* Omit CRC. */ + len -= ETH_FCS_LEN; skb = build_skb(buf, BGMAC_RX_ALLOC_SIZE); - skb_put(skb, BGMAC_RX_FRAME_OFFSET + len); - skb_pull(skb, BGMAC_RX_FRAME_OFFSET); + skb_put(skb, BGMAC_RX_FRAME_OFFSET + + BGMAC_RX_BUF_OFFSET + len); + skb_pull(skb, BGMAC_RX_FRAME_OFFSET + + BGMAC_RX_BUF_OFFSET); skb_checksum_none_assert(skb); skb->protocol = eth_type_trans(skb, bgmac->net_dev); @@ -469,6 +477,8 @@ static int bgmac_dma_rx_read(struct bgmac *bgmac, struct bgmac_dma_ring *ring, handled++; } while (0); + bgmac_dma_rx_setup_desc(bgmac, ring, ring->start); + if (++ring->start >= BGMAC_RX_RING_SLOTS) ring->start = 0; @@ -476,6 +486,8 @@ static int bgmac_dma_rx_read(struct bgmac *bgmac, struct bgmac_dma_ring *ring, break; } + bgmac_dma_rx_update_index(bgmac, ring); + return handled; } @@ -509,7 +521,7 @@ static void bgmac_dma_tx_ring_free(struct bgmac *bgmac, struct bgmac_slot_info *slot; int i; - for (i = 0; i < ring->num_slots; i++) { + for (i = 0; i < BGMAC_TX_RING_SLOTS; i++) { int len = dma_desc[i].ctl1 & BGMAC_DESC_CTL1_LEN; slot = &ring->slots[i]; @@ -534,21 +546,22 @@ static void bgmac_dma_rx_ring_free(struct bgmac *bgmac, struct bgmac_slot_info *slot; int i; - for (i = 0; i < ring->num_slots; i++) { + for (i = 0; i < BGMAC_RX_RING_SLOTS; i++) { slot = &ring->slots[i]; - if (!slot->buf) + if (!slot->dma_addr) continue; - if (slot->dma_addr) - dma_unmap_single(dma_dev, slot->dma_addr, - BGMAC_RX_BUF_SIZE, - DMA_FROM_DEVICE); + dma_unmap_single(dma_dev, slot->dma_addr, + BGMAC_RX_BUF_SIZE, + DMA_FROM_DEVICE); put_page(virt_to_head_page(slot->buf)); + slot->dma_addr = 0; } } static void bgmac_dma_ring_desc_free(struct bgmac *bgmac, - struct bgmac_dma_ring *ring) + struct bgmac_dma_ring *ring, + int num_slots) { struct device *dma_dev = bgmac->core->dma_dev; int size; @@ -557,23 +570,33 @@ static void bgmac_dma_ring_desc_free(struct bgmac *bgmac, return; /* Free ring of descriptors */ - size = ring->num_slots * sizeof(struct bgmac_dma_desc); + size = num_slots * sizeof(struct bgmac_dma_desc); dma_free_coherent(dma_dev, size, ring->cpu_base, ring->dma_base); } -static void bgmac_dma_free(struct bgmac *bgmac) +static void bgmac_dma_cleanup(struct bgmac *bgmac) { int i; - for (i = 0; i < BGMAC_MAX_TX_RINGS; i++) { + for (i = 0; i < BGMAC_MAX_TX_RINGS; i++) bgmac_dma_tx_ring_free(bgmac, &bgmac->tx_ring[i]); - bgmac_dma_ring_desc_free(bgmac, &bgmac->tx_ring[i]); - } - for (i = 0; i < BGMAC_MAX_RX_RINGS; i++) { + + for (i = 0; i < BGMAC_MAX_RX_RINGS; i++) bgmac_dma_rx_ring_free(bgmac, &bgmac->rx_ring[i]); - bgmac_dma_ring_desc_free(bgmac, &bgmac->rx_ring[i]); - } +} + +static void bgmac_dma_free(struct bgmac *bgmac) +{ + int i; + + for (i = 0; i < BGMAC_MAX_TX_RINGS; i++) + bgmac_dma_ring_desc_free(bgmac, &bgmac->tx_ring[i], + BGMAC_TX_RING_SLOTS); + + for (i = 0; i < BGMAC_MAX_RX_RINGS; i++) + bgmac_dma_ring_desc_free(bgmac, &bgmac->rx_ring[i], + BGMAC_RX_RING_SLOTS); } static int bgmac_dma_alloc(struct bgmac *bgmac) @@ -596,11 +619,10 @@ static int bgmac_dma_alloc(struct bgmac *bgmac) for (i = 0; i < BGMAC_MAX_TX_RINGS; i++) { ring = &bgmac->tx_ring[i]; - ring->num_slots = BGMAC_TX_RING_SLOTS; ring->mmio_base = ring_base[i]; /* Alloc ring of descriptors */ - size = ring->num_slots * sizeof(struct bgmac_dma_desc); + size = BGMAC_TX_RING_SLOTS * sizeof(struct bgmac_dma_desc); ring->cpu_base = dma_zalloc_coherent(dma_dev, size, &ring->dma_base, GFP_KERNEL); @@ -621,14 +643,11 @@ static int bgmac_dma_alloc(struct bgmac *bgmac) } for (i = 0; i < BGMAC_MAX_RX_RINGS; i++) { - int j; - ring = &bgmac->rx_ring[i]; - ring->num_slots = BGMAC_RX_RING_SLOTS; ring->mmio_base = ring_base[i]; /* Alloc ring of descriptors */ - size = ring->num_slots * sizeof(struct bgmac_dma_desc); + size = BGMAC_RX_RING_SLOTS * sizeof(struct bgmac_dma_desc); ring->cpu_base = dma_zalloc_coherent(dma_dev, size, &ring->dma_base, GFP_KERNEL); @@ -645,15 +664,6 @@ static int bgmac_dma_alloc(struct bgmac *bgmac) ring->index_base = lower_32_bits(ring->dma_base); else ring->index_base = 0; - - /* Alloc RX slots */ - for (j = 0; j < ring->num_slots; j++) { - err = bgmac_dma_rx_skb_for_slot(bgmac, &ring->slots[j]); - if (err) { - bgmac_err(bgmac, "Can't allocate skb for slot in RX ring\n"); - goto err_dma_free; - } - } } return 0; @@ -663,10 +673,10 @@ err_dma_free: return -ENOMEM; } -static void bgmac_dma_init(struct bgmac *bgmac) +static int bgmac_dma_init(struct bgmac *bgmac) { struct bgmac_dma_ring *ring; - int i; + int i, err; for (i = 0; i < BGMAC_MAX_TX_RINGS; i++) { ring = &bgmac->tx_ring[i]; @@ -698,16 +708,24 @@ static void bgmac_dma_init(struct bgmac *bgmac) if (ring->unaligned) bgmac_dma_rx_enable(bgmac, ring); - for (j = 0; j < ring->num_slots; j++) - bgmac_dma_rx_setup_desc(bgmac, ring, j); - - bgmac_write(bgmac, ring->mmio_base + BGMAC_DMA_RX_INDEX, - ring->index_base + - ring->num_slots * sizeof(struct bgmac_dma_desc)); - ring->start = 0; ring->end = 0; + for (j = 0; j < BGMAC_RX_RING_SLOTS; j++) { + err = bgmac_dma_rx_skb_for_slot(bgmac, &ring->slots[j]); + if (err) + goto error; + + bgmac_dma_rx_setup_desc(bgmac, ring, j); + } + + bgmac_dma_rx_update_index(bgmac, ring); } + + return 0; + +error: + bgmac_dma_cleanup(bgmac); + return err; } /************************************************** @@ -1115,8 +1133,6 @@ static void bgmac_chip_reset(struct bgmac *bgmac) bgmac_phy_init(bgmac); netdev_reset_queue(bgmac->net_dev); - - bgmac->int_status = 0; } static void bgmac_chip_intrs_on(struct bgmac *bgmac) @@ -1185,11 +1201,8 @@ static void bgmac_enable(struct bgmac *bgmac) } /* http://bcm-v4.sipsolutions.net/mac-gbit/gmac/chipinit */ -static void bgmac_chip_init(struct bgmac *bgmac, bool full_init) +static void bgmac_chip_init(struct bgmac *bgmac) { - struct bgmac_dma_ring *ring; - int i; - /* 1 interrupt per received frame */ bgmac_write(bgmac, BGMAC_INT_RECV_LAZY, 1 << BGMAC_IRL_FC_SHIFT); @@ -1207,16 +1220,7 @@ static void bgmac_chip_init(struct bgmac *bgmac, bool full_init) bgmac_write(bgmac, BGMAC_RXMAX_LENGTH, 32 + ETHER_MAX_LEN); - if (full_init) { - bgmac_dma_init(bgmac); - if (1) /* FIXME: is there any case we don't want IRQs? */ - bgmac_chip_intrs_on(bgmac); - } else { - for (i = 0; i < BGMAC_MAX_RX_RINGS; i++) { - ring = &bgmac->rx_ring[i]; - bgmac_dma_rx_enable(bgmac, ring); - } - } + bgmac_chip_intrs_on(bgmac); bgmac_enable(bgmac); } @@ -1231,14 +1235,13 @@ static irqreturn_t bgmac_interrupt(int irq, void *dev_id) if (!int_status) return IRQ_NONE; - /* Ack */ - bgmac_write(bgmac, BGMAC_INT_STATUS, int_status); + int_status &= ~(BGMAC_IS_TX0 | BGMAC_IS_RX); + if (int_status) + bgmac_err(bgmac, "Unknown IRQs: 0x%08X\n", int_status); /* Disable new interrupts until handling existing ones */ bgmac_chip_intrs_off(bgmac); - bgmac->int_status = int_status; - napi_schedule(&bgmac->napi); return IRQ_HANDLED; @@ -1247,25 +1250,17 @@ static irqreturn_t bgmac_interrupt(int irq, void *dev_id) static int bgmac_poll(struct napi_struct *napi, int weight) { struct bgmac *bgmac = container_of(napi, struct bgmac, napi); - struct bgmac_dma_ring *ring; int handled = 0; - if (bgmac->int_status & BGMAC_IS_TX0) { - ring = &bgmac->tx_ring[0]; - bgmac_dma_tx_free(bgmac, ring); - bgmac->int_status &= ~BGMAC_IS_TX0; - } + /* Ack */ + bgmac_write(bgmac, BGMAC_INT_STATUS, ~0); - if (bgmac->int_status & BGMAC_IS_RX) { - ring = &bgmac->rx_ring[0]; - handled += bgmac_dma_rx_read(bgmac, ring, weight); - bgmac->int_status &= ~BGMAC_IS_RX; - } + bgmac_dma_tx_free(bgmac, &bgmac->tx_ring[0]); + handled += bgmac_dma_rx_read(bgmac, &bgmac->rx_ring[0], weight); - if (bgmac->int_status) { - bgmac_err(bgmac, "Unknown IRQs: 0x%08X\n", bgmac->int_status); - bgmac->int_status = 0; - } + /* Poll again if more events arrived in the meantime */ + if (bgmac_read(bgmac, BGMAC_INT_STATUS) & (BGMAC_IS_TX0 | BGMAC_IS_RX)) + return handled; if (handled < weight) { napi_complete(napi); @@ -1285,23 +1280,27 @@ static int bgmac_open(struct net_device *net_dev) int err = 0; bgmac_chip_reset(bgmac); + + err = bgmac_dma_init(bgmac); + if (err) + return err; + /* Specs say about reclaiming rings here, but we do that in DMA init */ - bgmac_chip_init(bgmac, true); + bgmac_chip_init(bgmac); err = request_irq(bgmac->core->irq, bgmac_interrupt, IRQF_SHARED, KBUILD_MODNAME, net_dev); if (err < 0) { bgmac_err(bgmac, "IRQ request error: %d!\n", err); - goto err_out; + bgmac_dma_cleanup(bgmac); + return err; } napi_enable(&bgmac->napi); phy_start(bgmac->phy_dev); netif_carrier_on(net_dev); - -err_out: - return err; + return 0; } static int bgmac_stop(struct net_device *net_dev) @@ -1317,6 +1316,7 @@ static int bgmac_stop(struct net_device *net_dev) free_irq(bgmac->core->irq, net_dev); bgmac_chip_reset(bgmac); + bgmac_dma_cleanup(bgmac); return 0; } diff --git a/drivers/net/ethernet/broadcom/bgmac.h b/drivers/net/ethernet/broadcom/bgmac.h index 3ad965fe7fcc..db27febbb215 100644 --- a/drivers/net/ethernet/broadcom/bgmac.h +++ b/drivers/net/ethernet/broadcom/bgmac.h @@ -356,13 +356,15 @@ #define BGMAC_MAX_RX_RINGS 1 #define BGMAC_TX_RING_SLOTS 128 -#define BGMAC_RX_RING_SLOTS 512 - 1 /* Why -1? Well, Broadcom does that... */ +#define BGMAC_RX_RING_SLOTS 512 #define BGMAC_RX_HEADER_LEN 28 /* Last 24 bytes are unused. Well... */ #define BGMAC_RX_FRAME_OFFSET 30 /* There are 2 unused bytes between header and real data */ +#define BGMAC_RX_BUF_OFFSET (NET_SKB_PAD + NET_IP_ALIGN - \ + BGMAC_RX_FRAME_OFFSET) #define BGMAC_RX_MAX_FRAME_SIZE 1536 /* Copied from b44/tg3 */ #define BGMAC_RX_BUF_SIZE (BGMAC_RX_FRAME_OFFSET + BGMAC_RX_MAX_FRAME_SIZE) -#define BGMAC_RX_ALLOC_SIZE (SKB_DATA_ALIGN(BGMAC_RX_BUF_SIZE) + \ +#define BGMAC_RX_ALLOC_SIZE (SKB_DATA_ALIGN(BGMAC_RX_BUF_SIZE + BGMAC_RX_BUF_OFFSET) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) #define BGMAC_BFL_ENETROBO 0x0010 /* has ephy roboswitch spi */ @@ -414,14 +416,13 @@ enum bgmac_dma_ring_type { * empty. */ struct bgmac_dma_ring { - u16 num_slots; - u16 start; - u16 end; + u32 start; + u32 end; - u16 mmio_base; struct bgmac_dma_desc *cpu_base; dma_addr_t dma_base; u32 index_base; /* Used for unaligned rings only, otherwise 0 */ + u16 mmio_base; bool unaligned; struct bgmac_slot_info slots[BGMAC_RX_RING_SLOTS]; @@ -452,7 +453,6 @@ struct bgmac { /* Int */ u32 int_mask; - u32 int_status; /* Current MAC state */ int mac_speed; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 24e10ea3d5ef..6de054404156 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -724,7 +724,8 @@ static irqreturn_t t4_nondata_intr(int irq, void *cookie) adap->swintr = 1; t4_write_reg(adap, MYPF_REG(PL_PF_INT_CAUSE_A), v); } - t4_slow_intr_handler(adap); + if (adap->flags & MASTER_PF) + t4_slow_intr_handler(adap); return IRQ_HANDLED; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index e622214e2eca..0d2eddab04ef 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -121,12 +121,6 @@ #define NOMEM_TMR_IDX (SGE_NTIMERS - 1) /* - * An FL with <= FL_STARVE_THRES buffers is starving and a periodic timer will - * attempt to refill it. - */ -#define FL_STARVE_THRES 4 - -/* * Suspend an Ethernet Tx queue with fewer available descriptors than this. * This is the same as calc_tx_descs() for a TSO packet with * nr_frags == MAX_SKB_FRAGS. @@ -144,7 +138,7 @@ * Max Tx descriptor space we allow for an Ethernet packet to be inlined * into a WR. */ -#define MAX_IMM_TX_PKT_LEN 128 +#define MAX_IMM_TX_PKT_LEN 256 /* * Max size of a WR sent through a control Tx queue. @@ -248,9 +242,21 @@ static inline unsigned int fl_cap(const struct sge_fl *fl) return fl->size - 8; /* 1 descriptor = 8 buffers */ } -static inline bool fl_starving(const struct sge_fl *fl) +/** + * fl_starving - return whether a Free List is starving. + * @adapter: pointer to the adapter + * @fl: the Free List + * + * Tests specified Free List to see whether the number of buffers + * available to the hardware has falled below our "starvation" + * threshold. + */ +static inline bool fl_starving(const struct adapter *adapter, + const struct sge_fl *fl) { - return fl->avail - fl->pend_cred <= FL_STARVE_THRES; + const struct sge *s = &adapter->sge; + + return fl->avail - fl->pend_cred <= s->fl_starve_thres; } static int map_skb(struct device *dev, const struct sk_buff *skb, @@ -586,8 +592,10 @@ static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n, unsigned int cred = q->avail; __be64 *d = &q->desc[q->pidx]; struct rx_sw_desc *sd = &q->sdesc[q->pidx]; + int node; gfp |= __GFP_NOWARN; + node = dev_to_node(adap->pdev_dev); if (s->fl_pg_order == 0) goto alloc_small_pages; @@ -596,7 +604,7 @@ static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n, * Prefer large buffers */ while (n) { - pg = __dev_alloc_pages(gfp, s->fl_pg_order); + pg = alloc_pages_node(node, gfp | __GFP_COMP, s->fl_pg_order); if (unlikely(!pg)) { q->large_alloc_failed++; break; /* fall back to single pages */ @@ -626,7 +634,7 @@ static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n, alloc_small_pages: while (n--) { - pg = __dev_alloc_page(gfp); + pg = alloc_pages_node(node, gfp, 0); if (unlikely(!pg)) { q->alloc_failed++; break; @@ -655,7 +663,7 @@ out: cred = q->avail - cred; q->pend_cred += cred; ring_fl_db(adap, q); - if (unlikely(fl_starving(q))) { + if (unlikely(fl_starving(adap, q))) { smp_wmb(); set_bit(q->cntxt_id - adap->sge.egr_start, adap->sge.starving_fl); @@ -722,6 +730,22 @@ static void *alloc_ring(struct device *dev, size_t nelem, size_t elem_size, */ static inline unsigned int sgl_len(unsigned int n) { + /* A Direct Scatter Gather List uses 32-bit lengths and 64-bit PCI DMA + * addresses. The DSGL Work Request starts off with a 32-bit DSGL + * ULPTX header, then Length0, then Address0, then, for 1 <= i <= N, + * repeated sequences of { Length[i], Length[i+1], Address[i], + * Address[i+1] } (this ensures that all addresses are on 64-bit + * boundaries). If N is even, then Length[N+1] should be set to 0 and + * Address[N+1] is omitted. + * + * The following calculation incorporates all of the above. It's + * somewhat hard to follow but, briefly: the "+2" accounts for the + * first two flits which include the DSGL header, Length0 and + * Address0; the "(3*(n-1))/2" covers the main body of list entries (3 + * flits for every pair of the remaining N) +1 if (n-1) is odd; and + * finally the "+((n-1)&1)" adds the one remaining flit needed if + * (n-1) is odd ... + */ n--; return (3 * n) / 2 + (n & 1) + 2; } @@ -769,12 +793,30 @@ static inline unsigned int calc_tx_flits(const struct sk_buff *skb) unsigned int flits; int hdrlen = is_eth_imm(skb); + /* If the skb is small enough, we can pump it out as a work request + * with only immediate data. In that case we just have to have the + * TX Packet header plus the skb data in the Work Request. + */ + if (hdrlen) return DIV_ROUND_UP(skb->len + hdrlen, sizeof(__be64)); + /* Otherwise, we're going to have to construct a Scatter gather list + * of the skb body and fragments. We also include the flits necessary + * for the TX Packet Work Request and CPL. We always have a firmware + * Write Header (incorporated as part of the cpl_tx_pkt_lso and + * cpl_tx_pkt structures), followed by either a TX Packet Write CPL + * message or, if we're doing a Large Send Offload, an LSO CPL message + * with an embedded TX Packet Write CPL message. + */ flits = sgl_len(skb_shinfo(skb)->nr_frags + 1) + 4; if (skb_shinfo(skb)->gso_size) - flits += 2; + flits += (sizeof(struct fw_eth_tx_pkt_wr) + + sizeof(struct cpl_tx_pkt_lso_core) + + sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64); + else + flits += (sizeof(struct fw_eth_tx_pkt_wr) + + sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64); return flits; } @@ -2196,7 +2238,8 @@ static irqreturn_t t4_intr_msi(int irq, void *cookie) { struct adapter *adap = cookie; - t4_slow_intr_handler(adap); + if (adap->flags & MASTER_PF) + t4_slow_intr_handler(adap); process_intrq(adap); return IRQ_HANDLED; } @@ -2211,7 +2254,8 @@ static irqreturn_t t4_intr_intx(int irq, void *cookie) struct adapter *adap = cookie; t4_write_reg(adap, MYPF_REG(PCIE_PF_CLI_A), 0); - if (t4_slow_intr_handler(adap) | process_intrq(adap)) + if (((adap->flags & MASTER_PF) && t4_slow_intr_handler(adap)) | + process_intrq(adap)) return IRQ_HANDLED; return IRQ_NONE; /* probably shared interrupt */ } @@ -2248,7 +2292,7 @@ static void sge_rx_timer_cb(unsigned long data) clear_bit(id, s->starving_fl); smp_mb__after_atomic(); - if (fl_starving(fl)) { + if (fl_starving(adap, fl)) { rxq = container_of(fl, struct sge_eth_rxq, fl); if (napi_reschedule(&rxq->rspq.napi)) fl->starving++; diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h index 4b0494b9cc7c..1bf1cdce74ac 100644 --- a/drivers/net/ethernet/emulex/benet/be.h +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -99,6 +99,7 @@ #define BE_NAPI_WEIGHT 64 #define MAX_RX_POST BE_NAPI_WEIGHT /* Frags posted at a time */ #define RX_FRAGS_REFILL_WM (RX_Q_LEN - MAX_RX_POST) +#define MAX_NUM_POST_ERX_DB 255u #define MAX_VFS 30 /* Max VFs supported by BE3 FW */ #define FW_VER_LEN 32 diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 5ff7fba9b67c..fb0bc3c3620e 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -2122,7 +2122,7 @@ static void be_post_rx_frags(struct be_rx_obj *rxo, gfp_t gfp, u32 frags_needed) if (rxo->rx_post_starved) rxo->rx_post_starved = false; do { - notify = min(256u, posted); + notify = min(MAX_NUM_POST_ERX_DB, posted); be_rxq_notify(adapter, rxq->id, notify); posted -= notify; } while (posted); diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index a69f09e37b58..5d9ceb17b4cb 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -343,6 +343,7 @@ struct e1000_adapter { struct timecounter tc; struct ptp_clock *ptp_clock; struct ptp_clock_info ptp_clock_info; + struct pm_qos_request pm_qos_req; u16 eee_advert; }; diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 74ec185a697f..c509a5c900f5 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -3297,9 +3297,9 @@ static void e1000_configure_rx(struct e1000_adapter *adapter) ew32(RXDCTL(0), rxdctl | 0x3); } - pm_qos_update_request(&adapter->netdev->pm_qos_req, lat); + pm_qos_update_request(&adapter->pm_qos_req, lat); } else { - pm_qos_update_request(&adapter->netdev->pm_qos_req, + pm_qos_update_request(&adapter->pm_qos_req, PM_QOS_DEFAULT_VALUE); } @@ -4403,7 +4403,7 @@ static int e1000_open(struct net_device *netdev) e1000_update_mng_vlan(adapter); /* DMA latency requirement to workaround jumbo issue */ - pm_qos_add_request(&adapter->netdev->pm_qos_req, PM_QOS_CPU_DMA_LATENCY, + pm_qos_add_request(&adapter->pm_qos_req, PM_QOS_CPU_DMA_LATENCY, PM_QOS_DEFAULT_VALUE); /* before we allocate an interrupt, we must be ready to handle it. @@ -4514,7 +4514,7 @@ static int e1000_close(struct net_device *netdev) !test_bit(__E1000_TESTING, &adapter->state)) e1000e_release_hw_control(adapter); - pm_qos_remove_request(&adapter->netdev->pm_qos_req); + pm_qos_remove_request(&adapter->pm_qos_req); pm_runtime_put_sync(&pdev->dev); diff --git a/drivers/net/ethernet/toshiba/Kconfig b/drivers/net/ethernet/toshiba/Kconfig index 74acb5cf6099..5d244b6b5e3a 100644 --- a/drivers/net/ethernet/toshiba/Kconfig +++ b/drivers/net/ethernet/toshiba/Kconfig @@ -5,7 +5,7 @@ config NET_VENDOR_TOSHIBA bool "Toshiba devices" default y - depends on PCI && (PPC_IBM_CELL_BLADE || PPC_CELLEB || MIPS) || PPC_PS3 + depends on PCI && (PPC_IBM_CELL_BLADE || MIPS) || PPC_PS3 ---help--- If you have a network (Ethernet) card belonging to this class, say Y and read the Ethernet-HOWTO, available from @@ -42,7 +42,7 @@ config GELIC_WIRELESS config SPIDER_NET tristate "Spider Gigabit Ethernet driver" - depends on PCI && (PPC_IBM_CELL_BLADE || PPC_CELLEB) + depends on PCI && PPC_IBM_CELL_BLADE select FW_LOADER select SUNGEM_PHY ---help--- diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index f0b8b3e0ed7c..a10b31664709 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -132,6 +132,8 @@ struct hv_netvsc_packet { bool is_data_pkt; bool xmit_more; /* from skb */ + bool cp_partial; /* partial copy into send buffer */ + u16 vlan_tci; u16 q_idx; @@ -146,6 +148,9 @@ struct hv_netvsc_packet { /* This points to the memory after page_buf */ struct rndis_message *rndis_msg; + u32 rmsg_size; /* RNDIS header and PPI size */ + u32 rmsg_pgcnt; /* page count of RNDIS header and PPI */ + u32 total_data_buflen; /* Points to the send/receive buffer where the ethernet frame is */ void *data; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 4d4d497d5762..2e8ad0636b46 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -703,15 +703,18 @@ static u32 netvsc_copy_to_send_buf(struct netvsc_device *net_device, u32 msg_size = 0; u32 padding = 0; u32 remain = packet->total_data_buflen % net_device->pkt_align; + u32 page_count = packet->cp_partial ? packet->rmsg_pgcnt : + packet->page_buf_cnt; /* Add padding */ - if (packet->is_data_pkt && packet->xmit_more && remain) { + if (packet->is_data_pkt && packet->xmit_more && remain && + !packet->cp_partial) { padding = net_device->pkt_align - remain; packet->rndis_msg->msg_len += padding; packet->total_data_buflen += padding; } - for (i = 0; i < packet->page_buf_cnt; i++) { + for (i = 0; i < page_count; i++) { char *src = phys_to_virt(packet->page_buf[i].pfn << PAGE_SHIFT); u32 offset = packet->page_buf[i].offset; u32 len = packet->page_buf[i].len; @@ -739,6 +742,7 @@ static inline int netvsc_send_pkt( struct net_device *ndev = net_device->ndev; u64 req_id; int ret; + struct hv_page_buffer *pgbuf; nvmsg.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT; if (packet->is_data_pkt) { @@ -766,8 +770,10 @@ static inline int netvsc_send_pkt( return -ENODEV; if (packet->page_buf_cnt) { + pgbuf = packet->cp_partial ? packet->page_buf + + packet->rmsg_pgcnt : packet->page_buf; ret = vmbus_sendpacket_pagebuffer(out_channel, - packet->page_buf, + pgbuf, packet->page_buf_cnt, &nvmsg, sizeof(struct nvsp_message), @@ -824,6 +830,7 @@ int netvsc_send(struct hv_device *device, unsigned long flag; struct multi_send_data *msdp; struct hv_netvsc_packet *msd_send = NULL, *cur_send = NULL; + bool try_batch; net_device = get_outbound_net_device(device); if (!net_device) @@ -837,6 +844,7 @@ int netvsc_send(struct hv_device *device, } packet->channel = out_channel; packet->send_buf_index = NETVSC_INVALID_INDEX; + packet->cp_partial = false; msdp = &net_device->msd[q_idx]; @@ -845,12 +853,18 @@ int netvsc_send(struct hv_device *device, if (msdp->pkt) msd_len = msdp->pkt->total_data_buflen; - if (packet->is_data_pkt && msd_len > 0 && - msdp->count < net_device->max_pkt && - msd_len + pktlen + net_device->pkt_align < + try_batch = packet->is_data_pkt && msd_len > 0 && msdp->count < + net_device->max_pkt; + + if (try_batch && msd_len + pktlen + net_device->pkt_align < net_device->send_section_size) { section_index = msdp->pkt->send_buf_index; + } else if (try_batch && msd_len + packet->rmsg_size < + net_device->send_section_size) { + section_index = msdp->pkt->send_buf_index; + packet->cp_partial = true; + } else if (packet->is_data_pkt && pktlen + net_device->pkt_align < net_device->send_section_size) { section_index = netvsc_get_next_send_section(net_device); @@ -866,22 +880,26 @@ int netvsc_send(struct hv_device *device, netvsc_copy_to_send_buf(net_device, section_index, msd_len, packet); - if (!packet->part_of_skb) { - skb = (struct sk_buff *) - (unsigned long) - packet->send_completion_tid; - - packet->send_completion_tid = 0; - } - packet->page_buf_cnt = 0; packet->send_buf_index = section_index; - packet->total_data_buflen += msd_len; + + if (packet->cp_partial) { + packet->page_buf_cnt -= packet->rmsg_pgcnt; + packet->total_data_buflen = msd_len + packet->rmsg_size; + } else { + packet->page_buf_cnt = 0; + packet->total_data_buflen += msd_len; + if (!packet->part_of_skb) { + skb = (struct sk_buff *)(unsigned long)packet-> + send_completion_tid; + packet->send_completion_tid = 0; + } + } if (msdp->pkt) netvsc_xmit_completion(msdp->pkt); - if (packet->xmit_more) { + if (packet->xmit_more && !packet->cp_partial) { msdp->pkt = packet; msdp->count++; } else { diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 448716787e73..a3a9d3898a6e 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -277,15 +277,16 @@ static u32 fill_pg_buf(struct page *page, u32 offset, u32 len, } static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, - struct hv_page_buffer *pb) + struct hv_netvsc_packet *packet) { + struct hv_page_buffer *pb = packet->page_buf; u32 slots_used = 0; char *data = skb->data; int frags = skb_shinfo(skb)->nr_frags; int i; /* The packet is laid out thus: - * 1. hdr + * 1. hdr: RNDIS header and PPI * 2. skb linear data * 3. skb fragment data */ @@ -294,6 +295,9 @@ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, offset_in_page(hdr), len, &pb[slots_used]); + packet->rmsg_size = len; + packet->rmsg_pgcnt = slots_used; + slots_used += fill_pg_buf(virt_to_page(data), offset_in_page(data), skb_headlen(skb), &pb[slots_used]); @@ -578,7 +582,7 @@ do_send: rndis_msg->msg_len += rndis_msg_size; packet->total_data_buflen = rndis_msg->msg_len; packet->page_buf_cnt = init_page_array(rndis_msg, rndis_msg_size, - skb, &page_buf[0]); + skb, packet); ret = netvsc_send(net_device_ctx->device_ctx, packet); diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 16adbc481772..8fadaa14b9f0 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -68,8 +68,8 @@ config SMSC_PHY config BROADCOM_PHY tristate "Drivers for Broadcom PHYs" ---help--- - Currently supports the BCM5411, BCM5421, BCM5461, BCM5464, BCM5481 - and BCM5482 PHYs. + Currently supports the BCM5411, BCM5421, BCM5461, BCM54616S, BCM5464, + BCM5481 and BCM5482 PHYs. config BCM63XX_PHY tristate "Drivers for Broadcom 63xx SOCs internal PHY" diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index a52afb26421b..9c71295f2fef 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -549,6 +549,19 @@ static struct phy_driver broadcom_drivers[] = { .config_intr = bcm54xx_config_intr, .driver = { .owner = THIS_MODULE }, }, { + .phy_id = PHY_ID_BCM54616S, + .phy_id_mask = 0xfffffff0, + .name = "Broadcom BCM54616S", + .features = PHY_GBIT_FEATURES | + SUPPORTED_Pause | SUPPORTED_Asym_Pause, + .flags = PHY_HAS_MAGICANEG | PHY_HAS_INTERRUPT, + .config_init = bcm54xx_config_init, + .config_aneg = genphy_config_aneg, + .read_status = genphy_read_status, + .ack_interrupt = bcm54xx_ack_interrupt, + .config_intr = bcm54xx_config_intr, + .driver = { .owner = THIS_MODULE }, +}, { .phy_id = PHY_ID_BCM5464, .phy_id_mask = 0xfffffff0, .name = "Broadcom BCM5464", @@ -660,6 +673,7 @@ static struct mdio_device_id __maybe_unused broadcom_tbl[] = { { PHY_ID_BCM5411, 0xfffffff0 }, { PHY_ID_BCM5421, 0xfffffff0 }, { PHY_ID_BCM5461, 0xfffffff0 }, + { PHY_ID_BCM54616S, 0xfffffff0 }, { PHY_ID_BCM5464, 0xfffffff0 }, { PHY_ID_BCM5482, 0xfffffff0 }, { PHY_ID_BCM5482, 0xfffffff0 }, diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 777757ae1973..733f4feb2ef3 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1072,7 +1072,7 @@ static void __handle_set_rx_mode(struct usbnet *dev) * especially now that control transfers can be queued. */ static void -kevent (struct work_struct *work) +usbnet_deferred_kevent (struct work_struct *work) { struct usbnet *dev = container_of(work, struct usbnet, kevent); @@ -1626,7 +1626,7 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) skb_queue_head_init(&dev->rxq_pause); dev->bh.func = usbnet_bh; dev->bh.data = (unsigned long) dev; - INIT_WORK (&dev->kevent, kevent); + INIT_WORK (&dev->kevent, usbnet_deferred_kevent); init_usb_anchor(&dev->deferred); dev->delay.function = usbnet_bh; dev->delay.data = (unsigned long) dev; diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 577c9b071ad9..154116aafd0d 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1699,12 +1699,6 @@ static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk, } } - skb = iptunnel_handle_offloads(skb, udp_sum, type); - if (IS_ERR(skb)) { - err = -EINVAL; - goto err; - } - skb_scrub_packet(skb, xnet); min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len @@ -1724,6 +1718,12 @@ static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk, goto err; } + skb = iptunnel_handle_offloads(skb, udp_sum, type); + if (IS_ERR(skb)) { + err = -EINVAL; + goto err; + } + vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_HF_VNI); vxh->vx_vni = md->vni; @@ -1784,10 +1784,6 @@ int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, } } - skb = iptunnel_handle_offloads(skb, udp_sum, type); - if (IS_ERR(skb)) - return PTR_ERR(skb); - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + VXLAN_HLEN + sizeof(struct iphdr) + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); @@ -1803,6 +1799,10 @@ int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, if (WARN_ON(!skb)) return -ENOMEM; + skb = iptunnel_handle_offloads(skb, udp_sum, type); + if (IS_ERR(skb)) + return PTR_ERR(skb); + vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_HF_VNI); vxh->vx_vni = md->vni; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 2270bd51f9c2..d383f84869aa 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -33,7 +33,6 @@ static int sg_version_num = 30536; /* 2 digits for each component */ #include <linux/sched.h> #include <linux/string.h> #include <linux/mm.h> -#include <linux/aio.h> #include <linux/errno.h> #include <linux/mtio.h> #include <linux/ioctl.h> @@ -51,6 +50,7 @@ static int sg_version_num = 30536; /* 2 digits for each component */ #include <linux/mutex.h> #include <linux/atomic.h> #include <linux/ratelimit.h> +#include <linux/uio.h> #include "scsi.h" #include <scsi/scsi_dbg.h> diff --git a/drivers/staging/unisys/include/timskmod.h b/drivers/staging/unisys/include/timskmod.h index 4019a0d63645..52648d4d9922 100644 --- a/drivers/staging/unisys/include/timskmod.h +++ b/drivers/staging/unisys/include/timskmod.h @@ -46,7 +46,6 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/fcntl.h> -#include <linux/aio.h> #include <linux/workqueue.h> #include <linux/kthread.h> #include <linux/seq_file.h> diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 175c9956cbe3..a12315a78248 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -23,6 +23,7 @@ #include <linux/export.h> #include <linux/hid.h> #include <linux/module.h> +#include <linux/uio.h> #include <asm/unaligned.h> #include <linux/usb/composite.h> @@ -655,9 +656,10 @@ static void ffs_user_copy_worker(struct work_struct *work) unuse_mm(io_data->mm); } - aio_complete(io_data->kiocb, ret, ret); + io_data->kiocb->ki_complete(io_data->kiocb, ret, ret); - if (io_data->ffs->ffs_eventfd && !io_data->kiocb->ki_eventfd) + if (io_data->ffs->ffs_eventfd && + !(io_data->kiocb->ki_flags & IOCB_EVENTFD)) eventfd_signal(io_data->ffs->ffs_eventfd, 1); usb_ep_free_request(io_data->ep, io_data->req); diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 200f9a584064..662ef2c1c62b 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -26,6 +26,7 @@ #include <linux/poll.h> #include <linux/mmu_context.h> #include <linux/aio.h> +#include <linux/uio.h> #include <linux/device.h> #include <linux/moduleparam.h> @@ -469,7 +470,7 @@ static void ep_user_copy_worker(struct work_struct *work) ret = -EFAULT; /* completing the iocb can drop the ctx and mm, don't touch mm after */ - aio_complete(iocb, ret, ret); + iocb->ki_complete(iocb, ret, ret); kfree(priv->buf); kfree(priv->to_free); @@ -497,7 +498,8 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req) kfree(priv); iocb->private = NULL; /* aio_complete() reports bytes-transferred _and_ faults */ - aio_complete(iocb, req->actual ? req->actual : req->status, + + iocb->ki_complete(iocb, req->actual ? req->actual : req->status, req->status); } else { /* ep_copy_to_user() won't report both; we hide some faults */ diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 18f05bff8826..7d137a43cc86 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -357,13 +357,13 @@ static void handle_tx(struct vhost_net *net) iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len); iov_iter_advance(&msg.msg_iter, hdr_size); /* Sanity check */ - if (!iov_iter_count(&msg.msg_iter)) { + if (!msg_data_left(&msg)) { vq_err(vq, "Unexpected header len for TX: " "%zd expected %zd\n", len, hdr_size); break; } - len = iov_iter_count(&msg.msg_iter); + len = msg_data_left(&msg); zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN && (nvq->upend_idx + 1) % UIO_MAXIOV != diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index eb14e055ea83..ff1a5bac4200 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -33,7 +33,7 @@ #include <linux/pagemap.h> #include <linux/idr.h> #include <linux/sched.h> -#include <linux/aio.h> +#include <linux/uio.h> #include <net/9p/9p.h> #include <net/9p/client.h> diff --git a/fs/affs/file.c b/fs/affs/file.c index a91795e01a7f..3aa7eb66547e 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -12,7 +12,7 @@ * affs regular file handling primitives */ -#include <linux/aio.h> +#include <linux/uio.h> #include "affs.h" static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 0dd4dafee10b..91ea1aa0d8b3 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -22,9 +22,12 @@ int afs_abort_to_error(u32 abort_code) { switch (abort_code) { + /* low errno codes inserted into abort namespace */ case 13: return -EACCES; case 27: return -EFBIG; case 30: return -EROFS; + + /* VICE "special error" codes; 101 - 111 */ case VSALVAGE: return -EIO; case VNOVNODE: return -ENOENT; case VNOVOL: return -ENOMEDIUM; @@ -36,11 +39,18 @@ int afs_abort_to_error(u32 abort_code) case VOVERQUOTA: return -EDQUOT; case VBUSY: return -EBUSY; case VMOVED: return -ENXIO; - case 0x2f6df0a: return -EWOULDBLOCK; + + /* Unified AFS error table; ET "uae" == 0x2f6df00 */ + case 0x2f6df00: return -EPERM; + case 0x2f6df01: return -ENOENT; + case 0x2f6df04: return -EIO; + case 0x2f6df0a: return -EAGAIN; + case 0x2f6df0b: return -ENOMEM; case 0x2f6df0c: return -EACCES; case 0x2f6df0f: return -EBUSY; case 0x2f6df10: return -EEXIST; case 0x2f6df11: return -EXDEV; + case 0x2f6df12: return -ENODEV; case 0x2f6df13: return -ENOTDIR; case 0x2f6df14: return -EISDIR; case 0x2f6df15: return -EINVAL; @@ -54,8 +64,12 @@ int afs_abort_to_error(u32 abort_code) case 0x2f6df23: return -ENAMETOOLONG; case 0x2f6df24: return -ENOLCK; case 0x2f6df26: return -ENOTEMPTY; + case 0x2f6df28: return -EWOULDBLOCK; + case 0x2f6df69: return -ENOTCONN; + case 0x2f6df6c: return -ETIMEDOUT; case 0x2f6df78: return -EDQUOT; + /* RXKAD abort codes; from include/rxrpc/packet.h. ET "RXK" == 0x1260B00 */ case RXKADINCONSISTENCY: return -EPROTO; case RXKADPACKETSHORT: return -EPROTO; case RXKADLEVELFAIL: return -EKEYREJECTED; diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index dbc732e9a5c0..3a57a1b0fb51 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -770,15 +770,12 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, void afs_send_empty_reply(struct afs_call *call) { struct msghdr msg; - struct kvec iov[1]; _enter(""); - iov[0].iov_base = NULL; - iov[0].iov_len = 0; msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 0, 0); /* WTF? */ + iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, NULL, 0, 0); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; diff --git a/fs/afs/write.c b/fs/afs/write.c index c13cb08964ed..0714abcd7f32 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -14,7 +14,6 @@ #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/pagevec.h> -#include <linux/aio.h> #include "internal.h" static int afs_write_back_from_locked_page(struct afs_writeback *wb, @@ -151,6 +151,38 @@ struct kioctx { unsigned id; }; +/* + * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either + * cancelled or completed (this makes a certain amount of sense because + * successful cancellation - io_cancel() - does deliver the completion to + * userspace). + * + * And since most things don't implement kiocb cancellation and we'd really like + * kiocb completion to be lockless when possible, we use ki_cancel to + * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED + * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel(). + */ +#define KIOCB_CANCELLED ((void *) (~0ULL)) + +struct aio_kiocb { + struct kiocb common; + + struct kioctx *ki_ctx; + kiocb_cancel_fn *ki_cancel; + + struct iocb __user *ki_user_iocb; /* user's aiocb */ + __u64 ki_user_data; /* user's data for completion */ + + struct list_head ki_list; /* the aio core uses this + * for cancellation */ + + /* + * If the aio_resfd field of the userspace iocb is not zero, + * this is the underlying eventfd context to deliver events to. + */ + struct eventfd_ctx *ki_eventfd; +}; + /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); unsigned long aio_nr; /* current system wide number of aio requests */ @@ -220,7 +252,7 @@ static int __init aio_setup(void) if (IS_ERR(aio_mnt)) panic("Failed to create aio fs mount."); - kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); + kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page)); @@ -480,8 +512,9 @@ static int aio_setup_ring(struct kioctx *ctx) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) +void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) { + struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common); struct kioctx *ctx = req->ki_ctx; unsigned long flags; @@ -496,7 +529,7 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) } EXPORT_SYMBOL(kiocb_set_cancel_fn); -static int kiocb_cancel(struct kiocb *kiocb) +static int kiocb_cancel(struct aio_kiocb *kiocb) { kiocb_cancel_fn *old, *cancel; @@ -514,7 +547,7 @@ static int kiocb_cancel(struct kiocb *kiocb) cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); } while (cancel != old); - return cancel(kiocb); + return cancel(&kiocb->common); } static void free_ioctx(struct work_struct *work) @@ -550,13 +583,13 @@ static void free_ioctx_reqs(struct percpu_ref *ref) static void free_ioctx_users(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, users); - struct kiocb *req; + struct aio_kiocb *req; spin_lock_irq(&ctx->ctx_lock); while (!list_empty(&ctx->active_reqs)) { req = list_first_entry(&ctx->active_reqs, - struct kiocb, ki_list); + struct aio_kiocb, ki_list); list_del_init(&req->ki_list); kiocb_cancel(req); @@ -778,22 +811,6 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, return 0; } -/* wait_on_sync_kiocb: - * Waits on the given sync kiocb to complete. - */ -ssize_t wait_on_sync_kiocb(struct kiocb *req) -{ - while (!req->ki_ctx) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (req->ki_ctx) - break; - io_schedule(); - } - __set_current_state(TASK_RUNNING); - return req->ki_user_data; -} -EXPORT_SYMBOL(wait_on_sync_kiocb); - /* * exit_aio: called when the last user of mm goes away. At this point, there is * no way for any new requests to be submited or any of the io_* syscalls to be @@ -948,9 +965,9 @@ static void user_refill_reqs_available(struct kioctx *ctx) * Allocate a slot for an aio request. * Returns NULL if no requests are free. */ -static inline struct kiocb *aio_get_req(struct kioctx *ctx) +static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) { - struct kiocb *req; + struct aio_kiocb *req; if (!get_reqs_available(ctx)) { user_refill_reqs_available(ctx); @@ -971,10 +988,10 @@ out_put: return NULL; } -static void kiocb_free(struct kiocb *req) +static void kiocb_free(struct aio_kiocb *req) { - if (req->ki_filp) - fput(req->ki_filp); + if (req->common.ki_filp) + fput(req->common.ki_filp); if (req->ki_eventfd != NULL) eventfd_ctx_put(req->ki_eventfd); kmem_cache_free(kiocb_cachep, req); @@ -1010,8 +1027,9 @@ out: /* aio_complete * Called when the io request on the given iocb is complete. */ -void aio_complete(struct kiocb *iocb, long res, long res2) +static void aio_complete(struct kiocb *kiocb, long res, long res2) { + struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common); struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; @@ -1025,13 +1043,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) * ref, no other paths have a way to get another ref * - the sync task helpfully left a reference to itself in the iocb */ - if (is_sync_kiocb(iocb)) { - iocb->ki_user_data = res; - smp_wmb(); - iocb->ki_ctx = ERR_PTR(-EXDEV); - wake_up_process(iocb->ki_obj.tsk); - return; - } + BUG_ON(is_sync_kiocb(kiocb)); if (iocb->ki_list.next) { unsigned long flags; @@ -1057,7 +1069,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; - event->obj = (u64)(unsigned long)iocb->ki_obj.user; + event->obj = (u64)(unsigned long)iocb->ki_user_iocb; event->data = iocb->ki_user_data; event->res = res; event->res2 = res2; @@ -1066,7 +1078,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", - ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, + ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data, res, res2); /* after flagging the request as done, we @@ -1113,7 +1125,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) percpu_ref_put(&ctx->reqs); } -EXPORT_SYMBOL(aio_complete); /* aio_read_events_ring * Pull an event off of the ioctx's event ring. Returns the number of @@ -1344,12 +1355,13 @@ typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *); static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, int rw, char __user *buf, unsigned long *nr_segs, + size_t *len, struct iovec **iovec, bool compat) { ssize_t ret; - *nr_segs = kiocb->ki_nbytes; + *nr_segs = *len; #ifdef CONFIG_COMPAT if (compat) @@ -1364,21 +1376,22 @@ static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, if (ret < 0) return ret; - /* ki_nbytes now reflect bytes instead of segs */ - kiocb->ki_nbytes = ret; + /* len now reflect bytes instead of segs */ + *len = ret; return 0; } static ssize_t aio_setup_single_vector(struct kiocb *kiocb, int rw, char __user *buf, unsigned long *nr_segs, + size_t len, struct iovec *iovec) { - if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes))) + if (unlikely(!access_ok(!rw, buf, len))) return -EFAULT; iovec->iov_base = buf; - iovec->iov_len = kiocb->ki_nbytes; + iovec->iov_len = len; *nr_segs = 1; return 0; } @@ -1388,7 +1401,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb, * Performs the initial checks and io submission. */ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, - char __user *buf, bool compat) + char __user *buf, size_t len, bool compat) { struct file *file = req->ki_filp; ssize_t ret; @@ -1423,21 +1436,21 @@ rw_common: if (!rw_op && !iter_op) return -EINVAL; - ret = (opcode == IOCB_CMD_PREADV || - opcode == IOCB_CMD_PWRITEV) - ? aio_setup_vectored_rw(req, rw, buf, &nr_segs, - &iovec, compat) - : aio_setup_single_vector(req, rw, buf, &nr_segs, - iovec); + if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV) + ret = aio_setup_vectored_rw(req, rw, buf, &nr_segs, + &len, &iovec, compat); + else + ret = aio_setup_single_vector(req, rw, buf, &nr_segs, + len, iovec); if (!ret) - ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + ret = rw_verify_area(rw, file, &req->ki_pos, len); if (ret < 0) { if (iovec != inline_vecs) kfree(iovec); return ret; } - req->ki_nbytes = ret; + len = ret; /* XXX: move/kill - rw_verify_area()? */ /* This matches the pread()/pwrite() logic */ @@ -1450,7 +1463,7 @@ rw_common: file_start_write(file); if (iter_op) { - iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes); + iov_iter_init(&iter, rw, iovec, nr_segs, len); ret = iter_op(req, &iter); } else { ret = rw_op(req, iovec, nr_segs, req->ki_pos); @@ -1500,7 +1513,7 @@ rw_common: static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, struct iocb *iocb, bool compat) { - struct kiocb *req; + struct aio_kiocb *req; ssize_t ret; /* enforce forwards compatibility on users */ @@ -1523,11 +1536,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, if (unlikely(!req)) return -EAGAIN; - req->ki_filp = fget(iocb->aio_fildes); - if (unlikely(!req->ki_filp)) { + req->common.ki_filp = fget(iocb->aio_fildes); + if (unlikely(!req->common.ki_filp)) { ret = -EBADF; goto out_put_req; } + req->common.ki_pos = iocb->aio_offset; + req->common.ki_complete = aio_complete; + req->common.ki_flags = 0; if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* @@ -1542,6 +1558,8 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_eventfd = NULL; goto out_put_req; } + + req->common.ki_flags |= IOCB_EVENTFD; } ret = put_user(KIOCB_KEY, &user_iocb->aio_key); @@ -1550,13 +1568,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, goto out_put_req; } - req->ki_obj.user = user_iocb; + req->ki_user_iocb = user_iocb; req->ki_user_data = iocb->aio_data; - req->ki_pos = iocb->aio_offset; - req->ki_nbytes = iocb->aio_nbytes; - ret = aio_run_iocb(req, iocb->aio_lio_opcode, + ret = aio_run_iocb(&req->common, iocb->aio_lio_opcode, (char __user *)(unsigned long)iocb->aio_buf, + iocb->aio_nbytes, compat); if (ret) goto out_put_req; @@ -1643,10 +1660,10 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, /* lookup_kiocb * Finds a given iocb for cancellation. */ -static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, - u32 key) +static struct aio_kiocb * +lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key) { - struct list_head *pos; + struct aio_kiocb *kiocb; assert_spin_locked(&ctx->ctx_lock); @@ -1654,9 +1671,8 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, return NULL; /* TODO: use a hash or array, this sucks. */ - list_for_each(pos, &ctx->active_reqs) { - struct kiocb *kiocb = list_kiocb(pos); - if (kiocb->ki_obj.user == iocb) + list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { + if (kiocb->ki_user_iocb == iocb) return kiocb; } return NULL; @@ -1676,7 +1692,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result) { struct kioctx *ctx; - struct kiocb *kiocb; + struct aio_kiocb *kiocb; u32 key; int ret; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 90bc079d9982..fdcb4d69f430 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -15,6 +15,7 @@ #include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/writeback.h> +#include <linux/uio.h> #include <asm/uaccess.h> #include "bfs.h" diff --git a/fs/block_dev.c b/fs/block_dev.c index 975266be67d3..2e522aed6584 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -27,7 +27,6 @@ #include <linux/namei.h> #include <linux/log2.h> #include <linux/cleancache.h> -#include <linux/aio.h> #include <asm/uaccess.h> #include "internal.h" diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 30982bbd31c3..aee18f84e315 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,7 +24,6 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mpage.h> -#include <linux/aio.h> #include <linux/falloc.h> #include <linux/swap.h> #include <linux/writeback.h> @@ -32,6 +31,7 @@ #include <linux/compat.h> #include <linux/slab.h> #include <linux/btrfs.h> +#include <linux/uio.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d2e732d7af52..686331f22b15 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,7 +32,6 @@ #include <linux/writeback.h> #include <linux/statfs.h> #include <linux/compat.h> -#include <linux/aio.h> #include <linux/bit_spinlock.h> #include <linux/xattr.h> #include <linux/posix_acl.h> @@ -43,6 +42,7 @@ #include <linux/btrfs.h> #include <linux/blkdev.h> #include <linux/posix_acl_xattr.h> +#include <linux/uio.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d533075a823d..139f2fea91a0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -7,7 +7,6 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/writeback.h> -#include <linux/aio.h> #include <linux/falloc.h> #include "super.h" @@ -808,7 +807,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *filp = iocb->ki_filp; struct ceph_file_info *fi = filp->private_data; - size_t len = iocb->ki_nbytes; + size_t len = iov_iter_count(to); struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); struct page *pinned_page = NULL; diff --git a/fs/direct-io.c b/fs/direct-io.c index e181b6b2e297..6fb00e3f1059 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -37,7 +37,6 @@ #include <linux/uio.h> #include <linux/atomic.h> #include <linux/prefetch.h> -#include <linux/aio.h> /* * How many user pages to map in one call to get_user_pages(). This determines @@ -265,7 +264,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, ret = err; } - aio_complete(dio->iocb, ret, 0); + dio->iocb->ki_complete(dio->iocb, ret, 0); } kmem_cache_free(dio_cache, dio); @@ -1056,7 +1055,7 @@ static inline int drop_refcount(struct dio *dio) * operation. AIO can if it was a broken operation described above or * in fact if all the bios race to complete before we get here. In * that case dio_complete() translates the EIOCBQUEUED into the proper - * return code that the caller will hand to aio_complete(). + * return code that the caller will hand to ->complete(). * * This is managed by the bio_lock instead of being an atomic_t so that * completion paths can drop their ref and use the remaining count to diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index fd39bad6f1bd..79675089443d 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -31,7 +31,6 @@ #include <linux/security.h> #include <linux/compat.h> #include <linux/fs_stack.h> -#include <linux/aio.h> #include "ecryptfs_kernel.h" /** @@ -52,12 +51,6 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, struct file *file = iocb->ki_filp; rc = generic_file_read_iter(iocb, to); - /* - * Even though this is a async interface, we need to wait - * for IO to finish to update atime - */ - if (-EIOCBQUEUED == rc) - rc = wait_on_sync_kiocb(iocb); if (rc >= 0) { path = ecryptfs_dentry_to_lower_path(file->f_path.dentry); touch_atime(path); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 6434bc000125..df9d6afbc5d5 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -31,7 +31,7 @@ #include <linux/mpage.h> #include <linux/fiemap.h> #include <linux/namei.h> -#include <linux/aio.h> +#include <linux/uio.h> #include "ext2.h" #include "acl.h" #include "xattr.h" diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2c6ccc49ba27..db07ffbe7c85 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -27,7 +27,7 @@ #include <linux/writeback.h> #include <linux/mpage.h> #include <linux/namei.h> -#include <linux/aio.h> +#include <linux/uio.h> #include "ext3.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 33a09da16c9c..598abbbe6786 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -23,9 +23,9 @@ #include <linux/jbd2.h> #include <linux/mount.h> #include <linux/path.h> -#include <linux/aio.h> #include <linux/quotaops.h> #include <linux/pagevec.h> +#include <linux/uio.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 45fe924f82bc..740c7871c117 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -20,9 +20,9 @@ * (sct@redhat.com), 1993, 1998 */ -#include <linux/aio.h> #include "ext4_jbd2.h" #include "truncate.h" +#include <linux/uio.h> #include <trace/events/ext4.h> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5cb9a212b86f..a3f451370bef 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,7 +37,6 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/ratelimit.h> -#include <linux/aio.h> #include <linux/bitops.h> #include "ext4_jbd2.h" diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index b24a2541a9ba..464984261e69 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -18,7 +18,6 @@ #include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/namei.h> -#include <linux/aio.h> #include <linux/uio.h> #include <linux/bio.h> #include <linux/workqueue.h> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 985ed023a750..497f8515d205 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -12,12 +12,12 @@ #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> #include <linux/mpage.h> -#include <linux/aio.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/prefetch.h> +#include <linux/uio.h> #include "f2fs.h" #include "node.h" diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 497c7c5263c7..8521207de229 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -19,7 +19,6 @@ #include <linux/mpage.h> #include <linux/buffer_head.h> #include <linux/mount.h> -#include <linux/aio.h> #include <linux/vfs.h> #include <linux/parser.h> #include <linux/uio.h> diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 28d0c7abba1c..b3fa05032234 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -38,7 +38,6 @@ #include <linux/device.h> #include <linux/file.h> #include <linux/fs.h> -#include <linux/aio.h> #include <linux/kdev_t.h> #include <linux/kthread.h> #include <linux/list.h> @@ -48,6 +47,7 @@ #include <linux/slab.h> #include <linux/stat.h> #include <linux/module.h> +#include <linux/uio.h> #include "fuse_i.h" diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 39706c57ad3c..95a2797eef66 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -19,7 +19,6 @@ #include <linux/pipe_fs_i.h> #include <linux/swap.h> #include <linux/splice.h> -#include <linux/aio.h> MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c01ec3bdcfd8..ff102cbf16ea 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -15,8 +15,8 @@ #include <linux/module.h> #include <linux/compat.h> #include <linux/swap.h> -#include <linux/aio.h> #include <linux/falloc.h> +#include <linux/uio.h> static const struct file_operations fuse_direct_io_file_operations; @@ -528,6 +528,17 @@ static void fuse_release_user_pages(struct fuse_req *req, int write) } } +static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) +{ + if (io->err) + return io->err; + + if (io->bytes >= 0 && io->write) + return -EIO; + + return io->bytes < 0 ? io->size : io->bytes; +} + /** * In case of short read, the caller sets 'pos' to the position of * actual end of fuse request in IO request. Otherwise, if bytes_requested @@ -546,6 +557,7 @@ static void fuse_release_user_pages(struct fuse_req *req, int write) */ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) { + bool is_sync = is_sync_kiocb(io->iocb); int left; spin_lock(&io->lock); @@ -555,30 +567,24 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) io->bytes = pos; left = --io->reqs; + if (!left && is_sync) + complete(io->done); spin_unlock(&io->lock); - if (!left) { - long res; + if (!left && !is_sync) { + ssize_t res = fuse_get_res_by_io(io); - if (io->err) - res = io->err; - else if (io->bytes >= 0 && io->write) - res = -EIO; - else { - res = io->bytes < 0 ? io->size : io->bytes; + if (res >= 0) { + struct inode *inode = file_inode(io->iocb->ki_filp); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); - if (!is_sync_kiocb(io->iocb)) { - struct inode *inode = file_inode(io->iocb->ki_filp); - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); - - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; - spin_unlock(&fc->lock); - } + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + spin_unlock(&fc->lock); } - aio_complete(io->iocb, res, 0); + io->iocb->ki_complete(io->iocb, res, 0); kfree(io); } } @@ -2801,6 +2807,7 @@ static ssize_t fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { + DECLARE_COMPLETION_ONSTACK(wait); ssize_t ret = 0; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; @@ -2852,6 +2859,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE) io->async = false; + if (io->async && is_sync_kiocb(iocb)) + io->done = &wait; + if (rw == WRITE) ret = __fuse_direct_write(io, iter, &pos); else @@ -2864,11 +2874,12 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, if (!is_sync_kiocb(iocb)) return -EIOCBQUEUED; - ret = wait_on_sync_kiocb(iocb); - } else { - kfree(io); + wait_for_completion(&wait); + ret = fuse_get_res_by_io(io); } + kfree(io); + if (rw == WRITE) { if (ret > 0) fuse_write_update_size(inode, pos); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1cdfb07c1376..7354dc142a50 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -263,6 +263,7 @@ struct fuse_io_priv { int err; struct kiocb *iocb; struct file *file; + struct completion *done; }; /** diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 4ad4f94edebe..fe6634d25d1d 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -20,7 +20,7 @@ #include <linux/swap.h> #include <linux/gfs2_ondisk.h> #include <linux/backing-dev.h> -#include <linux/aio.h> +#include <linux/uio.h> #include <trace/events/writeback.h> #include "gfs2.h" diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 3e32bb8e2d7e..f6fc412b1100 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -25,7 +25,6 @@ #include <asm/uaccess.h> #include <linux/dlm.h> #include <linux/dlm_plock.h> -#include <linux/aio.h> #include <linux/delay.h> #include "gfs2.h" diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index d0929bc81782..98d4ea45bb70 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -14,7 +14,7 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> -#include <linux/aio.h> +#include <linux/uio.h> #include "hfs_fs.h" #include "btree.h" diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 0cf786f2d046..f541196d4ee9 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -14,7 +14,7 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> -#include <linux/aio.h> +#include <linux/uio.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index bd3df1ca3c9b..3197aed10614 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -22,8 +22,8 @@ #include <linux/buffer_head.h> #include <linux/pagemap.h> #include <linux/quotaops.h> +#include <linux/uio.h> #include <linux/writeback.h> -#include <linux/aio.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index e907c8cf732e..c3929fb2ab26 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -265,7 +265,7 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t return -EINVAL; #else - VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); + VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (rw == READ) return nfs_file_direct_read(iocb, iter, pos); @@ -393,7 +393,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) long res = (long) dreq->error; if (!res) res = (long) dreq->count; - aio_complete(dreq->iocb, res, 0); + dreq->iocb->ki_complete(dreq->iocb, res, 0); } complete_all(&dreq->completion); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index e679d24c39d3..37b15582e0de 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -26,7 +26,6 @@ #include <linux/nfs_mount.h> #include <linux/mm.h> #include <linux/pagemap.h> -#include <linux/aio.h> #include <linux/gfp.h> #include <linux/swap.h> diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 8b5969538f39..ab4987bc637f 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -26,7 +26,7 @@ #include <linux/mpage.h> #include <linux/pagemap.h> #include <linux/writeback.h> -#include <linux/aio.h> +#include <linux/uio.h> #include "nilfs.h" #include "btnode.h" #include "segment.h" diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 1da9b2d184dc..f16f2d8401fe 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -28,7 +28,6 @@ #include <linux/swap.h> #include <linux/uio.h> #include <linux/writeback.h> -#include <linux/aio.h> #include <asm/page.h> #include <asm/uaccess.h> diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 898b9949d363..1d0c21df0d80 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -28,7 +28,6 @@ #include <linux/quotaops.h> #include <linux/slab.h> #include <linux/log2.h> -#include <linux/aio.h> #include "aops.h" #include "attrib.h" diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 44db1808cdb5..e1bf18c5d25e 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -29,6 +29,7 @@ #include <linux/mpage.h> #include <linux/quotaops.h> #include <linux/blkdev.h> +#include <linux/uio.h> #include <cluster/masklog.h> diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 6cae155d54df..dd59599b022d 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -22,7 +22,7 @@ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H -#include <linux/aio.h> +#include <linux/fs.h> handle_t *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 46e0d4e857c7..266845de2100 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2280,7 +2280,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, file->f_path.dentry->d_name.name, (unsigned int)from->nr_segs); /* GRRRRR */ - if (iocb->ki_nbytes == 0) + if (count == 0) return 0; appending = file->f_flags & O_APPEND ? 1 : 0; @@ -2330,8 +2330,7 @@ relock: } can_do_direct = direct_io; - ret = ocfs2_prepare_inode_for_write(file, ppos, - iocb->ki_nbytes, appending, + ret = ocfs2_prepare_inode_for_write(file, ppos, count, appending, &can_do_direct, &has_refcount); if (ret < 0) { mlog_errno(ret); @@ -2339,8 +2338,7 @@ relock: } if (direct_io && !is_sync_kiocb(iocb)) - unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes, - *ppos); + unaligned_dio = ocfs2_is_io_unaligned(inode, count, *ppos); /* * We can't complete the direct I/O as requested, fall back to diff --git a/fs/pipe.c b/fs/pipe.c index 21981e58e2a6..2d084f2d0b83 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -21,7 +21,6 @@ #include <linux/audit.h> #include <linux/syscalls.h> #include <linux/fcntl.h> -#include <linux/aio.h> #include <asm/uaccess.h> #include <asm/ioctls.h> diff --git a/fs/read_write.c b/fs/read_write.c index 8e1b68786d66..99a6ef946d01 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -9,7 +9,6 @@ #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> -#include <linux/aio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> @@ -343,13 +342,10 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos) init_sync_kiocb(&kiocb, file); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = iov_iter_count(iter); iter->type |= READ; ret = file->f_op->read_iter(&kiocb, iter); - if (ret == -EIOCBQUEUED) - ret = wait_on_sync_kiocb(&kiocb); - + BUG_ON(ret == -EIOCBQUEUED); if (ret > 0) *ppos = kiocb.ki_pos; return ret; @@ -366,13 +362,10 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos) init_sync_kiocb(&kiocb, file); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = iov_iter_count(iter); iter->type |= WRITE; ret = file->f_op->write_iter(&kiocb, iter); - if (ret == -EIOCBQUEUED) - ret = wait_on_sync_kiocb(&kiocb); - + BUG_ON(ret == -EIOCBQUEUED); if (ret > 0) *ppos = kiocb.ki_pos; return ret; @@ -426,11 +419,9 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); + BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; } @@ -446,12 +437,10 @@ ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *p init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; iov_iter_init(&iter, READ, &iov, 1, len); ret = filp->f_op->read_iter(&kiocb, &iter); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); + BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; } @@ -510,11 +499,9 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); + BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; } @@ -530,12 +517,10 @@ ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, lo init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; iov_iter_init(&iter, WRITE, &iov, 1, len); ret = filp->f_op->write_iter(&kiocb, &iter); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); + BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; } @@ -719,12 +704,10 @@ static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iove init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; iov_iter_init(&iter, rw, iov, nr_segs, len); ret = fn(&kiocb, &iter); - if (ret == -EIOCBQUEUED) - ret = wait_on_sync_kiocb(&kiocb); + BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; } @@ -737,11 +720,9 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); - if (ret == -EIOCBQUEUED) - ret = wait_on_sync_kiocb(&kiocb); + BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index e72401e1f995..9312b7842e03 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -18,7 +18,7 @@ #include <linux/writeback.h> #include <linux/quotaops.h> #include <linux/swap.h> -#include <linux/aio.h> +#include <linux/uio.h> int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); diff --git a/fs/splice.c b/fs/splice.c index 7968da96bebb..4bbfa95b5bfe 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -32,7 +32,6 @@ #include <linux/gfp.h> #include <linux/socket.h> #include <linux/compat.h> -#include <linux/aio.h> #include "internal.h" /* diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index e627c0acf626..c3d15fe83403 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -50,7 +50,6 @@ */ #include "ubifs.h" -#include <linux/aio.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/slab.h> diff --git a/fs/udf/file.c b/fs/udf/file.c index 08f3555fbeac..7f885cc8b0b7 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -34,7 +34,7 @@ #include <linux/errno.h> #include <linux/pagemap.h> #include <linux/buffer_head.h> -#include <linux/aio.h> +#include <linux/uio.h> #include "udf_i.h" #include "udf_sb.h" @@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); int err, pos; - size_t count = iocb->ki_nbytes; + size_t count = iov_iter_count(from); struct udf_inode_info *iinfo = UDF_I(inode); mutex_lock(&inode->i_mutex); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index a445d599098d..9c1fbd23913d 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -38,7 +38,7 @@ #include <linux/slab.h> #include <linux/crc-itu-t.h> #include <linux/mpage.h> -#include <linux/aio.h> +#include <linux/uio.h> #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3a9b7a1b8704..4f8cdc59bc38 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -31,7 +31,6 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" -#include <linux/aio.h> #include <linux/gfp.h> #include <linux/mpage.h> #include <linux/pagevec.h> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a2e1cb8a568b..f44212fae653 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -38,7 +38,6 @@ #include "xfs_icache.h" #include "xfs_pnfs.h" -#include <linux/aio.h> #include <linux/dcache.h> #include <linux/falloc.h> #include <linux/pagevec.h> diff --git a/include/linux/aio.h b/include/linux/aio.h index d9c92daa3944..9eb42dbc5582 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -1,86 +1,23 @@ #ifndef __LINUX__AIO_H #define __LINUX__AIO_H -#include <linux/list.h> -#include <linux/workqueue.h> #include <linux/aio_abi.h> -#include <linux/uio.h> -#include <linux/rcupdate.h> - -#include <linux/atomic.h> struct kioctx; struct kiocb; +struct mm_struct; #define KIOCB_KEY 0 -/* - * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either - * cancelled or completed (this makes a certain amount of sense because - * successful cancellation - io_cancel() - does deliver the completion to - * userspace). - * - * And since most things don't implement kiocb cancellation and we'd really like - * kiocb completion to be lockless when possible, we use ki_cancel to - * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED - * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel(). - */ -#define KIOCB_CANCELLED ((void *) (~0ULL)) - typedef int (kiocb_cancel_fn)(struct kiocb *); -struct kiocb { - struct file *ki_filp; - struct kioctx *ki_ctx; /* NULL for sync ops */ - kiocb_cancel_fn *ki_cancel; - void *private; - - union { - void __user *user; - struct task_struct *tsk; - } ki_obj; - - __u64 ki_user_data; /* user's data for completion */ - loff_t ki_pos; - size_t ki_nbytes; /* copy of iocb->aio_nbytes */ - - struct list_head ki_list; /* the aio core uses this - * for cancellation */ - - /* - * If the aio_resfd field of the userspace iocb is not zero, - * this is the underlying eventfd context to deliver events to. - */ - struct eventfd_ctx *ki_eventfd; -}; - -static inline bool is_sync_kiocb(struct kiocb *kiocb) -{ - return kiocb->ki_ctx == NULL; -} - -static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) -{ - *kiocb = (struct kiocb) { - .ki_ctx = NULL, - .ki_filp = filp, - .ki_obj.tsk = current, - }; -} - /* prototypes */ #ifdef CONFIG_AIO -extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb); -extern void aio_complete(struct kiocb *iocb, long res, long res2); -struct mm_struct; extern void exit_aio(struct mm_struct *mm); extern long do_io_submit(aio_context_t ctx_id, long nr, struct iocb __user *__user *iocbpp, bool compat); void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); #else -static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; } -static inline void aio_complete(struct kiocb *iocb, long res, long res2) { } -struct mm_struct; static inline void exit_aio(struct mm_struct *mm) { } static inline long do_io_submit(aio_context_t ctx_id, long nr, struct iocb __user * __user *iocbpp, @@ -89,11 +26,6 @@ static inline void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) { } #endif /* CONFIG_AIO */ -static inline struct kiocb *list_kiocb(struct list_head *h) -{ - return list_entry(h, struct kiocb, ki_list); -} - /* for sysctl: */ extern unsigned long aio_nr; extern unsigned long aio_max_nr; diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index cab606617522..ae2982c0f7a6 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -11,6 +11,7 @@ #define PHY_ID_BCM5421 0x002060e0 #define PHY_ID_BCM5464 0x002060b0 #define PHY_ID_BCM5461 0x002060c0 +#define PHY_ID_BCM54616S 0x03625d10 #define PHY_ID_BCM57780 0x03625d90 #define PHY_ID_BCM7250 0xae025280 diff --git a/include/linux/fs.h b/include/linux/fs.h index f4131e8ead74..fdce1ddf230c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -314,6 +314,28 @@ struct page; struct address_space; struct writeback_control; +#define IOCB_EVENTFD (1 << 0) + +struct kiocb { + struct file *ki_filp; + loff_t ki_pos; + void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); + void *private; + int ki_flags; +}; + +static inline bool is_sync_kiocb(struct kiocb *kiocb) +{ + return kiocb->ki_complete == NULL; +} + +static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) +{ + *kiocb = (struct kiocb) { + .ki_filp = filp, + }; +} + /* * "descriptor" for what we're up to with a read. * This allows us to use the same read code yet diff --git a/include/linux/net.h b/include/linux/net.h index e74114bcca68..738ea48be889 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -211,7 +211,7 @@ int sock_create(int family, int type, int proto, struct socket **res); int sock_create_kern(int family, int type, int proto, struct socket **res); int sock_create_lite(int family, int type, int proto, struct socket **res); void sock_release(struct socket *sock); -int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len); +int sock_sendmsg(struct socket *sock, struct msghdr *msg); int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 13acb3d8ecdd..b5679aed660b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1313,6 +1313,8 @@ enum netdev_priv_flags { * @base_addr: Device I/O address * @irq: Device IRQ number * + * @carrier_changes: Stats to monitor carrier on<->off transitions + * * @state: Generic network queuing layer state, see netdev_state_t * @dev_list: The global list of network devices * @napi_list: List entry, that is used for polling napi devices @@ -1346,8 +1348,6 @@ enum netdev_priv_flags { * @tx_dropped: Dropped packets by core network, * do not use this in drivers * - * @carrier_changes: Stats to monitor carrier on<->off transitions - * * @wireless_handlers: List of functions to handle Wireless Extensions, * instead of ioctl, * see <net/iw_handler.h> for details. @@ -1390,14 +1390,14 @@ enum netdev_priv_flags { * @dev_port: Used to differentiate devices that share * the same function * @addr_list_lock: XXX: need comments on this one - * @uc: unicast mac addresses - * @mc: multicast mac addresses - * @dev_addrs: list of device hw addresses - * @queues_kset: Group of all Kobjects in the Tx and RX queues * @uc_promisc: Counter, that indicates, that promiscuous mode * has been enabled due to the need to listen to * additional unicast addresses in a device that * does not implement ndo_set_rx_mode() + * @uc: unicast mac addresses + * @mc: multicast mac addresses + * @dev_addrs: list of device hw addresses + * @queues_kset: Group of all Kobjects in the Tx and RX queues * @promiscuity: Number of times, the NIC is told to work in * Promiscuous mode, if it becomes 0 the NIC will * exit from working in Promiscuous mode @@ -1427,6 +1427,12 @@ enum netdev_priv_flags { * @ingress_queue: XXX: need comments on this one * @broadcast: hw bcast address * + * @rx_cpu_rmap: CPU reverse-mapping for RX completion interrupts, + * indexed by RX queue number. Assigned by driver. + * This must only be set if the ndo_rx_flow_steer + * operation is defined + * @index_hlist: Device index hash chain + * * @_tx: Array of TX queues * @num_tx_queues: Number of TX queues allocated at alloc_netdev_mq() time * @real_num_tx_queues: Number of TX queues currently active in device @@ -1436,11 +1442,6 @@ enum netdev_priv_flags { * * @xps_maps: XXX: need comments on this one * - * @rx_cpu_rmap: CPU reverse-mapping for RX completion interrupts, - * indexed by RX queue number. Assigned by driver. - * This must only be set if the ndo_rx_flow_steer - * operation is defined - * * @trans_start: Time (in jiffies) of last Tx * @watchdog_timeo: Represents the timeout that is used by * the watchdog ( see dev_watchdog() ) @@ -1448,7 +1449,6 @@ enum netdev_priv_flags { * * @pcpu_refcnt: Number of references to this device * @todo_list: Delayed register/unregister - * @index_hlist: Device index hash chain * @link_watch_list: XXX: need comments on this one * * @reg_state: Register/unregister state machine @@ -1515,6 +1515,8 @@ struct net_device { unsigned long base_addr; int irq; + atomic_t carrier_changes; + /* * Some hardware also needs these fields (state,dev_list, * napi_list,unreg_list,close_list) but they are not @@ -1555,8 +1557,6 @@ struct net_device { atomic_long_t rx_dropped; atomic_long_t tx_dropped; - atomic_t carrier_changes; - #ifdef CONFIG_WIRELESS_EXT const struct iw_handler_def * wireless_handlers; struct iw_public_data * wireless_data; @@ -1596,6 +1596,8 @@ struct net_device { unsigned short dev_id; unsigned short dev_port; spinlock_t addr_list_lock; + unsigned char name_assign_type; + bool uc_promisc; struct netdev_hw_addr_list uc; struct netdev_hw_addr_list mc; struct netdev_hw_addr_list dev_addrs; @@ -1603,10 +1605,6 @@ struct net_device { #ifdef CONFIG_SYSFS struct kset *queues_kset; #endif - - unsigned char name_assign_type; - - bool uc_promisc; unsigned int promiscuity; unsigned int allmulti; @@ -1653,7 +1651,10 @@ struct net_device { struct netdev_queue __rcu *ingress_queue; unsigned char broadcast[MAX_ADDR_LEN]; - +#ifdef CONFIG_RFS_ACCEL + struct cpu_rmap *rx_cpu_rmap; +#endif + struct hlist_node index_hlist; /* * Cache lines mostly used on transmit path @@ -1664,13 +1665,11 @@ struct net_device { struct Qdisc *qdisc; unsigned long tx_queue_len; spinlock_t tx_global_lock; + int watchdog_timeo; #ifdef CONFIG_XPS struct xps_dev_maps __rcu *xps_maps; #endif -#ifdef CONFIG_RFS_ACCEL - struct cpu_rmap *rx_cpu_rmap; -#endif /* These may be needed for future network-power-down code. */ @@ -1680,13 +1679,11 @@ struct net_device { */ unsigned long trans_start; - int watchdog_timeo; struct timer_list watchdog_timer; int __percpu *pcpu_refcnt; struct list_head todo_list; - struct hlist_node index_hlist; struct list_head link_watch_list; enum { NETREG_UNINITIALIZED=0, @@ -1751,7 +1748,6 @@ struct net_device { #endif struct phy_device *phydev; struct lock_class_key *qdisc_tx_busylock; - struct pm_qos_request pm_qos_req; }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 02fc86d2348e..6835c1279df7 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -134,7 +134,7 @@ struct netlink_callback { struct netlink_notify { struct net *net; - int portid; + u32 portid; int protocol; }; diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 5db76a32fcab..2da5d1081ad9 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -77,7 +77,20 @@ static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) return rtnl_dereference(dev->ingress_queue); } -extern struct netdev_queue *dev_ingress_queue_create(struct net_device *dev); +struct netdev_queue *dev_ingress_queue_create(struct net_device *dev); + +#ifdef CONFIG_NET_CLS_ACT +void net_inc_ingress_queue(void); +void net_dec_ingress_queue(void); +#else +static inline void net_inc_ingress_queue(void) +{ +} + +static inline void net_dec_ingress_queue(void) +{ +} +#endif extern void rtnetlink_init(void); extern void __rtnl_unlock(void); diff --git a/include/linux/socket.h b/include/linux/socket.h index c9852ef7e317..5bf59c8493b7 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -139,6 +139,11 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg); } +static inline size_t msg_data_left(struct msghdr *msg) +{ + return iov_iter_count(&msg->msg_iter); +} + /* "Socket"-level control message types: */ #define SCM_RIGHTS 0x01 /* rw: access rights (array of int) */ diff --git a/include/linux/uio.h b/include/linux/uio.h index 71880299ed48..1f4a37f1f025 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -139,4 +139,18 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); +int import_iovec(int type, const struct iovec __user * uvector, + unsigned nr_segs, unsigned fast_segs, + struct iovec **iov, struct iov_iter *i); + +#ifdef CONFIG_COMPAT +struct compat_iovec; +int compat_import_iovec(int type, const struct compat_iovec __user * uvector, + unsigned nr_segs, unsigned fast_segs, + struct iovec **iov, struct iov_iter *i); +#endif + +int import_single_range(int type, void __user *buf, size_t len, + struct iovec *iov, struct iov_iter *i); + #endif diff --git a/include/net/compat.h b/include/net/compat.h index 42a9c8431177..48103cf94e97 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -40,7 +40,7 @@ int compat_sock_get_timestampns(struct sock *, struct timespec __user *); #define compat_mmsghdr mmsghdr #endif /* defined(CONFIG_COMPAT) */ -ssize_t get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *, +int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *, struct sockaddr __user **, struct iovec **); asmlinkage long compat_sys_sendmsg(int, struct compat_msghdr __user *, unsigned int); diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index b7ce1003c429..360c4802288d 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -31,67 +31,14 @@ struct inet_hashinfo; -#define INET_TWDR_RECYCLE_SLOTS_LOG 5 -#define INET_TWDR_RECYCLE_SLOTS (1 << INET_TWDR_RECYCLE_SLOTS_LOG) - -/* - * If time > 4sec, it is "slow" path, no recycling is required, - * so that we select tick to get range about 4 seconds. - */ -#if HZ <= 16 || HZ > 4096 -# error Unsupported: HZ <= 16 or HZ > 4096 -#elif HZ <= 32 -# define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) -#elif HZ <= 64 -# define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) -#elif HZ <= 128 -# define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) -#elif HZ <= 256 -# define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) -#elif HZ <= 512 -# define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) -#elif HZ <= 1024 -# define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) -#elif HZ <= 2048 -# define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) -#else -# define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) -#endif - -static inline u32 inet_tw_time_stamp(void) -{ - return jiffies; -} - -/* TIME_WAIT reaping mechanism. */ -#define INET_TWDR_TWKILL_SLOTS 8 /* Please keep this a power of 2. */ - -#define INET_TWDR_TWKILL_QUOTA 100 - struct inet_timewait_death_row { - /* Short-time timewait calendar */ - int twcal_hand; - unsigned long twcal_jiffie; - struct timer_list twcal_timer; - struct hlist_head twcal_row[INET_TWDR_RECYCLE_SLOTS]; - - spinlock_t death_lock; - int tw_count; - int period; - u32 thread_slots; - struct work_struct twkill_work; - struct timer_list tw_timer; - int slot; - struct hlist_head cells[INET_TWDR_TWKILL_SLOTS]; - struct inet_hashinfo *hashinfo; + atomic_t tw_count; + + struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp; int sysctl_tw_recycle; int sysctl_max_tw_buckets; }; -void inet_twdr_hangman(unsigned long data); -void inet_twdr_twkill_work(struct work_struct *work); -void inet_twdr_twcal_tick(unsigned long data); - struct inet_bind_bucket; /* @@ -133,52 +80,18 @@ struct inet_timewait_sock { __be16 tw_sport; kmemcheck_bitfield_begin(flags); /* And these are ours. */ - unsigned int tw_pad0 : 1, /* 1 bit hole */ + unsigned int tw_kill : 1, tw_transparent : 1, tw_flowlabel : 20, tw_pad : 2, /* 2 bits hole */ tw_tos : 8; kmemcheck_bitfield_end(flags); - u32 tw_ttd; + struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; - struct hlist_node tw_death_node; + struct inet_timewait_death_row *tw_dr; }; #define tw_tclass tw_tos -static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw) -{ - return !hlist_unhashed(&tw->tw_death_node); -} - -static inline void inet_twsk_dead_node_init(struct inet_timewait_sock *tw) -{ - tw->tw_death_node.pprev = NULL; -} - -static inline void __inet_twsk_del_dead_node(struct inet_timewait_sock *tw) -{ - __hlist_del(&tw->tw_death_node); - inet_twsk_dead_node_init(tw); -} - -static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw) -{ - if (inet_twsk_dead_hashed(tw)) { - __inet_twsk_del_dead_node(tw); - return 1; - } - return 0; -} - -#define inet_twsk_for_each(tw, node, head) \ - hlist_nulls_for_each_entry(tw, node, head, tw_node) - -#define inet_twsk_for_each_inmate(tw, jail) \ - hlist_for_each_entry(tw, jail, tw_death_node) - -#define inet_twsk_for_each_inmate_safe(tw, safe, jail) \ - hlist_for_each_entry_safe(tw, safe, jail, tw_death_node) - static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) { return (struct inet_timewait_sock *)sk; @@ -193,16 +106,14 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo); struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, + struct inet_timewait_death_row *dr, const int state); void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo); -void inet_twsk_schedule(struct inet_timewait_sock *tw, - struct inet_timewait_death_row *twdr, - const int timeo, const int timewait_len); -void inet_twsk_deschedule(struct inet_timewait_sock *tw, - struct inet_timewait_death_row *twdr); +void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo); +void inet_twsk_deschedule(struct inet_timewait_sock *tw); void inet_twsk_purge(struct inet_hashinfo *hashinfo, struct inet_timewait_death_row *twdr, int family); diff --git a/include/net/sock.h b/include/net/sock.h index bd6f523f2251..3a4898ec8c67 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -57,7 +57,6 @@ #include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/static_key.h> -#include <linux/aio.h> #include <linux/sched.h> #include <linux/filter.h> diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h index f2902ef7ab75..4dce116bfd80 100644 --- a/include/rxrpc/packet.h +++ b/include/rxrpc/packet.h @@ -47,7 +47,8 @@ struct rxrpc_header { #define RXRPC_PACKET_TYPE_CHALLENGE 6 /* connection security challenge (SRVR->CLNT) */ #define RXRPC_PACKET_TYPE_RESPONSE 7 /* connection secutity response (CLNT->SRVR) */ #define RXRPC_PACKET_TYPE_DEBUG 8 /* debug info request */ -#define RXRPC_N_PACKET_TYPES 9 /* number of packet types (incl type 0) */ +#define RXRPC_PACKET_TYPE_VERSION 13 /* version string request */ +#define RXRPC_N_PACKET_TYPES 14 /* number of packet types (incl type 0) */ uint8_t flags; /* packet flags */ #define RXRPC_CLIENT_INITIATED 0x01 /* signifies a packet generated by a client */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index bb0635bd74f2..879edfc5ee52 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -32,7 +32,6 @@ #include <linux/security.h> #include <linux/bootmem.h> #include <linux/memblock.h> -#include <linux/aio.h> #include <linux/syscalls.h> #include <linux/kexec.h> #include <linux/kdb.h> @@ -46,6 +45,7 @@ #include <linux/irq_work.h> #include <linux/utsname.h> #include <linux/ctype.h> +#include <linux/uio.h> #include <asm/uaccess.h> @@ -521,7 +521,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) int i; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ - size_t len = iocb->ki_nbytes; + size_t len = iov_iter_count(from); ssize_t ret = len; if (len > LOG_LINE_MAX) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ce410bb9f2e1..4012336de30f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -19,6 +19,7 @@ */ #include <linux/module.h> +#include <linux/aio.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/slab.h> diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 9d96e283520c..fc6e33f6b7f3 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -766,3 +766,60 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) flags); } EXPORT_SYMBOL(dup_iter); + +int import_iovec(int type, const struct iovec __user * uvector, + unsigned nr_segs, unsigned fast_segs, + struct iovec **iov, struct iov_iter *i) +{ + ssize_t n; + struct iovec *p; + n = rw_copy_check_uvector(type, uvector, nr_segs, fast_segs, + *iov, &p); + if (n < 0) { + if (p != *iov) + kfree(p); + *iov = NULL; + return n; + } + iov_iter_init(i, type, p, nr_segs, n); + *iov = p == *iov ? NULL : p; + return 0; +} +EXPORT_SYMBOL(import_iovec); + +#ifdef CONFIG_COMPAT +#include <linux/compat.h> + +int compat_import_iovec(int type, const struct compat_iovec __user * uvector, + unsigned nr_segs, unsigned fast_segs, + struct iovec **iov, struct iov_iter *i) +{ + ssize_t n; + struct iovec *p; + n = compat_rw_copy_check_uvector(type, uvector, nr_segs, fast_segs, + *iov, &p); + if (n < 0) { + if (p != *iov) + kfree(p); + *iov = NULL; + return n; + } + iov_iter_init(i, type, p, nr_segs, n); + *iov = p == *iov ? NULL : p; + return 0; +} +#endif + +int import_single_range(int rw, void __user *buf, size_t len, + struct iovec *iov, struct iov_iter *i) +{ + if (len > MAX_RW_COUNT) + len = MAX_RW_COUNT; + if (unlikely(!access_ok(!rw, buf, len))) + return -EFAULT; + + iov->iov_base = buf; + iov->iov_len = len; + iov_iter_init(i, rw, iov, 1, len); + return 0; +} diff --git a/mm/filemap.c b/mm/filemap.c index ad7242043bdb..876f4e6f3ed6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -13,7 +13,6 @@ #include <linux/compiler.h> #include <linux/fs.h> #include <linux/uaccess.h> -#include <linux/aio.h> #include <linux/capability.h> #include <linux/kernel_stat.h> #include <linux/gfp.h> diff --git a/mm/page_io.c b/mm/page_io.c index e6045804c8d8..a96c8562d835 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -20,8 +20,8 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/frontswap.h> -#include <linux/aio.h> #include <linux/blkdev.h> +#include <linux/uio.h> #include <asm/pgtable.h> static struct bio *get_swap_bio(gfp_t gfp_flags, @@ -274,7 +274,6 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); - kiocb.ki_nbytes = PAGE_SIZE; set_page_writeback(page); unlock_page(page); diff --git a/mm/shmem.c b/mm/shmem.c index cf2d0ca010bc..80b360c7bcd1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -31,7 +31,7 @@ #include <linux/mm.h> #include <linux/export.h> #include <linux/swap.h> -#include <linux/aio.h> +#include <linux/uio.h> static struct vfsmount *shm_mnt; diff --git a/net/compat.c b/net/compat.c index c4b6b0f43d5d..5cfd26a0006f 100644 --- a/net/compat.c +++ b/net/compat.c @@ -31,10 +31,10 @@ #include <asm/uaccess.h> #include <net/compat.h> -ssize_t get_compat_msghdr(struct msghdr *kmsg, - struct compat_msghdr __user *umsg, - struct sockaddr __user **save_addr, - struct iovec **iov) +int get_compat_msghdr(struct msghdr *kmsg, + struct compat_msghdr __user *umsg, + struct sockaddr __user **save_addr, + struct iovec **iov) { compat_uptr_t uaddr, uiov, tmp3; compat_size_t nr_segs; @@ -81,13 +81,9 @@ ssize_t get_compat_msghdr(struct msghdr *kmsg, kmsg->msg_iocb = NULL; - err = compat_rw_copy_check_uvector(save_addr ? READ : WRITE, - compat_ptr(uiov), nr_segs, - UIO_FASTIOV, *iov, iov); - if (err >= 0) - iov_iter_init(&kmsg->msg_iter, save_addr ? READ : WRITE, - *iov, nr_segs, err); - return err; + return compat_import_iovec(save_addr ? READ : WRITE, + compat_ptr(uiov), nr_segs, + UIO_FASTIOV, iov, &kmsg->msg_iter); } /* Bleech... */ diff --git a/net/core/datagram.c b/net/core/datagram.c index df493d68330c..b80fb91bb3f7 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -673,7 +673,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, if (!chunk) return 0; - if (iov_iter_count(&msg->msg_iter) < chunk) { + if (msg_data_left(msg) < chunk) { if (__skb_checksum_complete(skb)) goto csum_error; if (skb_copy_datagram_msg(skb, hlen, msg, chunk)) diff --git a/net/core/dev.c b/net/core/dev.c index b2775f06c710..af4a1b0adc10 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1630,6 +1630,22 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); +#ifdef CONFIG_NET_CLS_ACT +static struct static_key ingress_needed __read_mostly; + +void net_inc_ingress_queue(void) +{ + static_key_slow_inc(&ingress_needed); +} +EXPORT_SYMBOL_GPL(net_inc_ingress_queue); + +void net_dec_ingress_queue(void) +{ + static_key_slow_dec(&ingress_needed); +} +EXPORT_SYMBOL_GPL(net_dec_ingress_queue); +#endif + static struct static_key netstamp_needed __read_mostly; #ifdef HAVE_JUMP_LABEL /* We are not allowed to call static_key_slow_dec() from irq context @@ -3547,7 +3563,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc) - goto out; + return skb; if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); @@ -3561,8 +3577,6 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, return NULL; } -out: - skb->tc_verd = 0; return skb; } #endif @@ -3698,12 +3712,15 @@ another_round: skip_taps: #ifdef CONFIG_NET_CLS_ACT - skb = handle_ing(skb, &pt_prev, &ret, orig_dev); - if (!skb) - goto unlock; + if (static_key_false(&ingress_needed)) { + skb = handle_ing(skb, &pt_prev, &ret, orig_dev); + if (!skb) + goto unlock; + } + + skb->tc_verd = 0; ncls: #endif - if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 332f7d6d9942..5f566663e47f 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -27,28 +27,16 @@ struct inet_timewait_death_row dccp_death_row = { .sysctl_max_tw_buckets = NR_FILE * 2, - .period = DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, - .death_lock = __SPIN_LOCK_UNLOCKED(dccp_death_row.death_lock), .hashinfo = &dccp_hashinfo, - .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, - (unsigned long)&dccp_death_row), - .twkill_work = __WORK_INITIALIZER(dccp_death_row.twkill_work, - inet_twdr_twkill_work), -/* Short-time timewait calendar */ - - .twcal_hand = -1, - .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, - (unsigned long)&dccp_death_row), }; EXPORT_SYMBOL_GPL(dccp_death_row); void dccp_time_wait(struct sock *sk, int state, int timeo) { - struct inet_timewait_sock *tw = NULL; + struct inet_timewait_sock *tw; - if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets) - tw = inet_twsk_alloc(sk, state); + tw = inet_twsk_alloc(sk, &dccp_death_row, state); if (tw != NULL) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -71,8 +59,7 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) if (state == DCCP_TIME_WAIT) timeo = DCCP_TIMEWAIT_LEN; - inet_twsk_schedule(tw, &dccp_death_row, timeo, - DCCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, timeo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 263710259774..af150b43b214 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -886,12 +886,12 @@ EXPORT_SYMBOL(gue_build_header); #ifdef CONFIG_NET_FOU_IP_TUNNELS -static const struct ip_tunnel_encap_ops __read_mostly fou_iptun_ops = { +static const struct ip_tunnel_encap_ops fou_iptun_ops = { .encap_hlen = fou_encap_hlen, .build_header = fou_build_header, }; -static const struct ip_tunnel_encap_ops __read_mostly gue_iptun_ops = { +static const struct ip_tunnel_encap_ops gue_iptun_ops = { .encap_hlen = gue_encap_hlen, .build_header = gue_build_header, }; diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index b77f5e84c623..8986e63f3bda 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -113,10 +113,6 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, int min_headroom; int err; - skb = udp_tunnel_handle_offloads(skb, csum); - if (IS_ERR(skb)) - return PTR_ERR(skb); - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); @@ -131,6 +127,10 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, if (unlikely(!skb)) return -ENOMEM; + skb = udp_tunnel_handle_offloads(skb, csum); + if (IS_ERR(skb)) + return PTR_ERR(skb); + gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 76322c9867d5..70e8b3c308ec 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -248,7 +248,7 @@ static int inet_twsk_diag_fill(struct sock *sk, struct inet_timewait_sock *tw = inet_twsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; - s32 tmo; + long tmo; nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), nlmsg_flags); @@ -258,7 +258,7 @@ static int inet_twsk_diag_fill(struct sock *sk, r = nlmsg_data(nlh); BUG_ON(tw->tw_state != TCP_TIME_WAIT); - tmo = tw->tw_ttd - inet_tw_time_stamp(); + tmo = tw->tw_timer.expires - jiffies; if (tmo < 0) tmo = 0; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index d4630bf2d9aa..c6fb80bd5826 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -388,7 +388,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, death_row); + inet_twsk_deschedule(tw); inet_twsk_put(tw); } @@ -565,7 +565,7 @@ ok: spin_unlock(&head->lock); if (tw) { - inet_twsk_deschedule(tw, death_row); + inet_twsk_deschedule(tw); while (twrefcnt) { twrefcnt--; inet_twsk_put(tw); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 118f0f195820..00ec8d5d7e7e 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -67,9 +67,9 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, } /* Must be called with locally disabled BHs. */ -static void __inet_twsk_kill(struct inet_timewait_sock *tw, - struct inet_hashinfo *hashinfo) +static void inet_twsk_kill(struct inet_timewait_sock *tw) { + struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; struct inet_bind_hashbucket *bhead; int refcnt; /* Unlink from established hashes. */ @@ -89,6 +89,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt)); atomic_sub(refcnt, &tw->tw_refcnt); + atomic_dec(&tw->tw_dr->tw_count); + inet_twsk_put(tw); } void inet_twsk_free(struct inet_timewait_sock *tw) @@ -168,16 +170,34 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); -struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) +void tw_timer_handler(unsigned long data) { - struct inet_timewait_sock *tw = - kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, - GFP_ATOMIC); + struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data; + + if (tw->tw_kill) + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); + else + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); + inet_twsk_kill(tw); +} + +struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, + struct inet_timewait_death_row *dr, + const int state) +{ + struct inet_timewait_sock *tw; + + if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets) + return NULL; + + tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, + GFP_ATOMIC); if (tw) { const struct inet_sock *inet = inet_sk(sk); kmemcheck_annotate_bitfield(tw, flags); + tw->tw_dr = dr; /* Give us an identity. */ tw->tw_daddr = inet->inet_daddr; tw->tw_rcv_saddr = inet->inet_rcv_saddr; @@ -196,13 +216,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); + setup_timer(&tw->tw_timer, tw_timer_handler, (unsigned long)tw); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this * timewait socket. */ atomic_set(&tw->tw_refcnt, 0); - inet_twsk_dead_node_init(tw); + __module_get(tw->tw_prot->owner); } @@ -210,139 +231,20 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat } EXPORT_SYMBOL_GPL(inet_twsk_alloc); -/* Returns non-zero if quota exceeded. */ -static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, - const int slot) -{ - struct inet_timewait_sock *tw; - unsigned int killed; - int ret; - - /* NOTE: compare this to previous version where lock - * was released after detaching chain. It was racy, - * because tw buckets are scheduled in not serialized context - * in 2.3 (with netfilter), and with softnet it is common, because - * soft irqs are not sequenced. - */ - killed = 0; - ret = 0; -rescan: - inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) { - __inet_twsk_del_dead_node(tw); - spin_unlock(&twdr->death_lock); - __inet_twsk_kill(tw, twdr->hashinfo); -#ifdef CONFIG_NET_NS - NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); -#endif - inet_twsk_put(tw); - killed++; - spin_lock(&twdr->death_lock); - if (killed > INET_TWDR_TWKILL_QUOTA) { - ret = 1; - break; - } - - /* While we dropped twdr->death_lock, another cpu may have - * killed off the next TW bucket in the list, therefore - * do a fresh re-read of the hlist head node with the - * lock reacquired. We still use the hlist traversal - * macro in order to get the prefetches. - */ - goto rescan; - } - - twdr->tw_count -= killed; -#ifndef CONFIG_NET_NS - NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed); -#endif - return ret; -} - -void inet_twdr_hangman(unsigned long data) -{ - struct inet_timewait_death_row *twdr; - unsigned int need_timer; - - twdr = (struct inet_timewait_death_row *)data; - spin_lock(&twdr->death_lock); - - if (twdr->tw_count == 0) - goto out; - - need_timer = 0; - if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { - twdr->thread_slots |= (1 << twdr->slot); - schedule_work(&twdr->twkill_work); - need_timer = 1; - } else { - /* We purged the entire slot, anything left? */ - if (twdr->tw_count) - need_timer = 1; - twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); - } - if (need_timer) - mod_timer(&twdr->tw_timer, jiffies + twdr->period); -out: - spin_unlock(&twdr->death_lock); -} -EXPORT_SYMBOL_GPL(inet_twdr_hangman); - -void inet_twdr_twkill_work(struct work_struct *work) -{ - struct inet_timewait_death_row *twdr = - container_of(work, struct inet_timewait_death_row, twkill_work); - int i; - - BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) > - (sizeof(twdr->thread_slots) * 8)); - - while (twdr->thread_slots) { - spin_lock_bh(&twdr->death_lock); - for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) { - if (!(twdr->thread_slots & (1 << i))) - continue; - - while (inet_twdr_do_twkill_work(twdr, i) != 0) { - if (need_resched()) { - spin_unlock_bh(&twdr->death_lock); - schedule(); - spin_lock_bh(&twdr->death_lock); - } - } - - twdr->thread_slots &= ~(1 << i); - } - spin_unlock_bh(&twdr->death_lock); - } -} -EXPORT_SYMBOL_GPL(inet_twdr_twkill_work); - /* These are always called from BH context. See callers in * tcp_input.c to verify this. */ /* This is for handling early-kills of TIME_WAIT sockets. */ -void inet_twsk_deschedule(struct inet_timewait_sock *tw, - struct inet_timewait_death_row *twdr) +void inet_twsk_deschedule(struct inet_timewait_sock *tw) { - spin_lock(&twdr->death_lock); - if (inet_twsk_del_dead_node(tw)) { - inet_twsk_put(tw); - if (--twdr->tw_count == 0) - del_timer(&twdr->tw_timer); - } - spin_unlock(&twdr->death_lock); - __inet_twsk_kill(tw, twdr->hashinfo); + if (del_timer_sync(&tw->tw_timer)) + inet_twsk_kill(tw); } EXPORT_SYMBOL(inet_twsk_deschedule); -void inet_twsk_schedule(struct inet_timewait_sock *tw, - struct inet_timewait_death_row *twdr, - const int timeo, const int timewait_len) +void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) { - struct hlist_head *list; - int slot; - /* timeout := RTO * 3.5 * * 3.5 = 1+2+0.5 to wait for two retransmits. @@ -367,115 +269,15 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, * is greater than TS tick!) and detect old duplicates with help * of PAWS. */ - slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK; - spin_lock(&twdr->death_lock); - - /* Unlink it, if it was scheduled */ - if (inet_twsk_del_dead_node(tw)) - twdr->tw_count--; - else + tw->tw_kill = timeo <= 4*HZ; + if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) { atomic_inc(&tw->tw_refcnt); - - if (slot >= INET_TWDR_RECYCLE_SLOTS) { - /* Schedule to slow timer */ - if (timeo >= timewait_len) { - slot = INET_TWDR_TWKILL_SLOTS - 1; - } else { - slot = DIV_ROUND_UP(timeo, twdr->period); - if (slot >= INET_TWDR_TWKILL_SLOTS) - slot = INET_TWDR_TWKILL_SLOTS - 1; - } - tw->tw_ttd = inet_tw_time_stamp() + timeo; - slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1); - list = &twdr->cells[slot]; - } else { - tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK); - - if (twdr->twcal_hand < 0) { - twdr->twcal_hand = 0; - twdr->twcal_jiffie = jiffies; - twdr->twcal_timer.expires = twdr->twcal_jiffie + - (slot << INET_TWDR_RECYCLE_TICK); - add_timer(&twdr->twcal_timer); - } else { - if (time_after(twdr->twcal_timer.expires, - jiffies + (slot << INET_TWDR_RECYCLE_TICK))) - mod_timer(&twdr->twcal_timer, - jiffies + (slot << INET_TWDR_RECYCLE_TICK)); - slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1); - } - list = &twdr->twcal_row[slot]; + atomic_inc(&tw->tw_dr->tw_count); } - - hlist_add_head(&tw->tw_death_node, list); - - if (twdr->tw_count++ == 0) - mod_timer(&twdr->tw_timer, jiffies + twdr->period); - spin_unlock(&twdr->death_lock); } EXPORT_SYMBOL_GPL(inet_twsk_schedule); -void inet_twdr_twcal_tick(unsigned long data) -{ - struct inet_timewait_death_row *twdr; - int n, slot; - unsigned long j; - unsigned long now = jiffies; - int killed = 0; - int adv = 0; - - twdr = (struct inet_timewait_death_row *)data; - - spin_lock(&twdr->death_lock); - if (twdr->twcal_hand < 0) - goto out; - - slot = twdr->twcal_hand; - j = twdr->twcal_jiffie; - - for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) { - if (time_before_eq(j, now)) { - struct hlist_node *safe; - struct inet_timewait_sock *tw; - - inet_twsk_for_each_inmate_safe(tw, safe, - &twdr->twcal_row[slot]) { - __inet_twsk_del_dead_node(tw); - __inet_twsk_kill(tw, twdr->hashinfo); -#ifdef CONFIG_NET_NS - NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); -#endif - inet_twsk_put(tw); - killed++; - } - } else { - if (!adv) { - adv = 1; - twdr->twcal_jiffie = j; - twdr->twcal_hand = slot; - } - - if (!hlist_empty(&twdr->twcal_row[slot])) { - mod_timer(&twdr->twcal_timer, j); - goto out; - } - } - j += 1 << INET_TWDR_RECYCLE_TICK; - slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1); - } - twdr->twcal_hand = -1; - -out: - if ((twdr->tw_count -= killed) == 0) - del_timer(&twdr->tw_timer); -#ifndef CONFIG_NET_NS - NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed); -#endif - spin_unlock(&twdr->death_lock); -} -EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); - void inet_twsk_purge(struct inet_hashinfo *hashinfo, struct inet_timewait_death_row *twdr, int family) { @@ -509,7 +311,7 @@ restart: rcu_read_unlock(); local_bh_disable(); - inet_twsk_deschedule(tw, twdr); + inet_twsk_deschedule(tw); local_bh_enable(); inet_twsk_put(tw); goto restart_rcu; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index d8953ef0770c..e1f3b911dd1e 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -63,7 +63,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", sock_prot_inuse_get(net, &tcp_prot), orphans, - tcp_death_row.tw_count, sockets, + atomic_read(&tcp_death_row.tw_count), sockets, proto_memory_allocated(&tcp_prot)); seq_printf(seq, "UDP: inuse %d mem %ld\n", sock_prot_inuse_get(net, &udp_prot), diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index c0bb648fb2f9..561cd4b8fc6e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -46,7 +46,6 @@ #include <linux/stddef.h> #include <linux/slab.h> #include <linux/errno.h> -#include <linux/aio.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/spinlock.h> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 094a6822c71d..18e3a12eb1b2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1119,7 +1119,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) sg = !!(sk->sk_route_caps & NETIF_F_SG); - while (iov_iter_count(&msg->msg_iter)) { + while (msg_data_left(msg)) { int copy = 0; int max = size_goal; @@ -1163,8 +1163,8 @@ new_segment: } /* Try to append data to the end of skb. */ - if (copy > iov_iter_count(&msg->msg_iter)) - copy = iov_iter_count(&msg->msg_iter); + if (copy > msg_data_left(msg)) + copy = msg_data_left(msg); /* Where to copy to? */ if (skb_availroom(skb) > 0) { @@ -1221,7 +1221,7 @@ new_segment: tcp_skb_pcount_set(skb, 0); copied += copy; - if (!iov_iter_count(&msg->msg_iter)) { + if (!msg_data_left(msg)) { tcp_tx_timestamp(sk, skb); goto out; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 031cf72cd05c..a7ef679dd3ea 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3099,17 +3099,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; - } else { + } else if (!(sacked & TCPCB_SACKED_ACKED)) { last_ackt = skb->skb_mstamp; WARN_ON_ONCE(last_ackt.v64 == 0); if (!first_ackt.v64) first_ackt = last_ackt; - if (!(sacked & TCPCB_SACKED_ACKED)) { - reord = min(pkts_acked, reord); - if (!after(scb->end_seq, tp->high_seq)) - flag |= FLAG_ORIG_SACK_ACKED; - } + reord = min(pkts_acked, reord); + if (!after(scb->end_seq, tp->high_seq)) + flag |= FLAG_ORIG_SACK_ACKED; } if (sacked & TCPCB_SACKED_ACKED) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 37578d52897e..3571f2be4470 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1685,7 +1685,7 @@ do_time_wait: iph->daddr, th->dest, inet_iif(skb)); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_deschedule(inet_twsk(sk)); inet_twsk_put(inet_twsk(sk)); sk = sk2; goto process; @@ -2242,9 +2242,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) static void get_timewait4_sock(const struct inet_timewait_sock *tw, struct seq_file *f, int i) { + long delta = tw->tw_timer.expires - jiffies; __be32 dest, src; __u16 destp, srcp; - s32 delta = tw->tw_ttd - inet_tw_time_stamp(); dest = tw->tw_daddr; src = tw->tw_rcv_saddr; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 2088fdcca141..63d6311b5365 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -34,18 +34,7 @@ int sysctl_tcp_abort_on_overflow __read_mostly; struct inet_timewait_death_row tcp_death_row = { .sysctl_max_tw_buckets = NR_FILE * 2, - .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, - .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock), .hashinfo = &tcp_hashinfo, - .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, - (unsigned long)&tcp_death_row), - .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, - inet_twdr_twkill_work), -/* Short-time timewait calendar */ - - .twcal_hand = -1, - .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, - (unsigned long)&tcp_death_row), }; EXPORT_SYMBOL_GPL(tcp_death_row); @@ -158,7 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (!th->fin || TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { kill_with_rst: - inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_deschedule(tw); inet_twsk_put(tw); return TCP_TW_RST; } @@ -174,11 +163,9 @@ kill_with_rst: if (tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_tw_remember_stamp(tw)) - inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, tw->tw_timeout); else - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -211,13 +198,12 @@ kill_with_rst: */ if (sysctl_tcp_rfc1337 == 0) { kill: - inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_deschedule(tw); inet_twsk_put(tw); return TCP_TW_SUCCESS; } } - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; @@ -267,8 +253,7 @@ kill: * Do not reschedule in the last case. */ if (paws_reject || th->ack) - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); @@ -283,16 +268,15 @@ EXPORT_SYMBOL(tcp_timewait_state_process); */ void tcp_time_wait(struct sock *sk, int state, int timeo) { - struct inet_timewait_sock *tw = NULL; const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); + struct inet_timewait_sock *tw; bool recycle_ok = false; if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = tcp_remember_stamp(sk); - if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) - tw = inet_twsk_alloc(sk, state); + tw = inet_twsk_alloc(sk, &tcp_death_row, state); if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); @@ -355,8 +339,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) timeo = TCP_TIMEWAIT_LEN; } - inet_twsk_schedule(tw, &tcp_death_row, timeo, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, timeo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e662d85d1635..8c8d7e06b72f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2994,6 +2994,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, rcu_read_unlock(); #endif + /* Do not fool tcpdump (if any), clean our debris */ + skb->tstamp.tv64 = 0; return skb; } EXPORT_SYMBOL(tcp_make_synack); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 033f17816ef4..871641bc1ed4 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -246,7 +246,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, death_row); + inet_twsk_deschedule(tw); inet_twsk_put(tw); } diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index b53148444e15..ed9d681207fa 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -288,8 +288,7 @@ static struct ip6_tnl *vti6_locate(struct net *net, struct __ip6_tnl_parm *p, static void vti6_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct vti6_net *ip6n = net_generic(t->net, vti6_net_id); if (dev == ip6n->fb_tnl_dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f73a97f6e68e..ad51df85aa00 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1486,7 +1486,7 @@ do_time_wait: ntohs(th->dest), tcp_v6_iif(skb)); if (sk2) { struct inet_timewait_sock *tw = inet_twsk(sk); - inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_deschedule(tw); inet_twsk_put(tw); sk = sk2; tcp_v6_restore_cb(skb); @@ -1728,9 +1728,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) static void get_timewait6_sock(struct seq_file *seq, struct inet_timewait_sock *tw, int i) { + long delta = tw->tw_timer.expires - jiffies; const struct in6_addr *dest, *src; __u16 destp, srcp; - s32 delta = tw->tw_ttd - inet_tw_time_stamp(); dest = &tw->tw_v6_daddr; src = &tw->tw_v6_rcv_saddr; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 51afea4b0af7..3ad91266c821 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -63,7 +63,7 @@ struct nfulnl_instance { struct timer_list timer; struct net *net; struct user_namespace *peer_user_ns; /* User namespace of the peer process */ - int peer_portid; /* PORTID of the peer process */ + u32 peer_portid; /* PORTID of the peer process */ /* configurable parameters */ unsigned int flushtimeout; /* timeout until queue flush */ @@ -152,7 +152,7 @@ static void nfulnl_timer(unsigned long data); static struct nfulnl_instance * instance_create(struct net *net, u_int16_t group_num, - int portid, struct user_namespace *user_ns) + u32 portid, struct user_namespace *user_ns) { struct nfulnl_instance *inst; struct nfnl_log_net *log = nfnl_log_pernet(net); @@ -1007,7 +1007,7 @@ static int seq_show(struct seq_file *s, void *v) { const struct nfulnl_instance *inst = v; - seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n", + seq_printf(s, "%5u %6u %5u %1u %5u %6u %2u\n", inst->group_num, inst->peer_portid, inst->qlen, inst->copy_mode, inst->copy_range, diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 628afc350c02..0b98c7420239 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -55,7 +55,7 @@ struct nfqnl_instance { struct hlist_node hlist; /* global list of queues */ struct rcu_head rcu; - int peer_portid; + u32 peer_portid; unsigned int queue_maxlen; unsigned int copy_range; unsigned int queue_dropped; @@ -110,8 +110,7 @@ instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num) } static struct nfqnl_instance * -instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, - int portid) +instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) { struct nfqnl_instance *inst; unsigned int h; @@ -870,7 +869,7 @@ static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { }; static struct nfqnl_instance * -verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, int nlportid) +verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, u32 nlportid) { struct nfqnl_instance *queue; @@ -1252,7 +1251,7 @@ static int seq_show(struct seq_file *s, void *v) { const struct nfqnl_instance *inst = v; - seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n", + seq_printf(s, "%5u %6u %5u %1u %5u %5u %5u %8u %2d\n", inst->queue_num, inst->peer_portid, inst->queue_total, inst->copy_mode, inst->copy_range, diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index c205b26a2bee..cca96cec1b68 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -272,7 +272,7 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, hp->source, lport ? lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_deschedule(inet_twsk(sk)); inet_twsk_put(inet_twsk(sk)); sk = sk2; } @@ -437,7 +437,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, tgi->lport ? tgi->lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_deschedule(inet_twsk(sk)); inet_twsk_put(inet_twsk(sk)); sk = sk2; } diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 14a2d11581da..3763036710ae 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1584,7 +1584,7 @@ static const struct genl_ops nfc_genl_ops[] = { struct urelease_work { struct work_struct w; - int portid; + u32 portid; }; static void nfc_urelease_event_work(struct work_struct *work) diff --git a/net/rds/connection.c b/net/rds/connection.c index 378c3a6acf84..14f041398ca1 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -130,7 +130,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, rcu_read_lock(); conn = rds_conn_lookup(head, laddr, faddr, trans); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && - !is_outgoing) { + laddr == faddr && !is_outgoing) { /* This is a looped back IB connection, and we're * called by the code handling the incoming connect. * We need a second connection object into which we @@ -193,6 +193,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, } atomic_set(&conn->c_state, RDS_CONN_DOWN); + conn->c_send_gen = 0; conn->c_reconnect_jiffies = 0; INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker); INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker); diff --git a/net/rds/rds.h b/net/rds/rds.h index c3f2855c3d84..0d41155a2258 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -110,6 +110,7 @@ struct rds_connection { void *c_transport_data; atomic_t c_state; + unsigned long c_send_gen; unsigned long c_flags; unsigned long c_reconnect_jiffies; struct delayed_work c_send_w; diff --git a/net/rds/send.c b/net/rds/send.c index 44672befc0ee..e9430f537f9c 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -140,8 +140,11 @@ int rds_send_xmit(struct rds_connection *conn) struct scatterlist *sg; int ret = 0; LIST_HEAD(to_be_dropped); + int batch_count; + unsigned long send_gen = 0; restart: + batch_count = 0; /* * sendmsg calls here after having queued its message on the send @@ -157,6 +160,17 @@ restart: } /* + * we record the send generation after doing the xmit acquire. + * if someone else manages to jump in and do some work, we'll use + * this to avoid a goto restart farther down. + * + * The acquire_in_xmit() check above ensures that only one + * caller can increment c_send_gen at any time. + */ + conn->c_send_gen++; + send_gen = conn->c_send_gen; + + /* * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, * we do the opposite to avoid races. */ @@ -202,6 +216,16 @@ restart: if (!rm) { unsigned int len; + batch_count++; + + /* we want to process as big a batch as we can, but + * we also want to avoid softlockups. If we've been + * through a lot of messages, lets back off and see + * if anyone else jumps in + */ + if (batch_count >= 1024) + goto over_batch; + spin_lock_irqsave(&conn->c_lock, flags); if (!list_empty(&conn->c_send_queue)) { @@ -357,9 +381,9 @@ restart: } } +over_batch: if (conn->c_trans->xmit_complete) conn->c_trans->xmit_complete(conn); - release_in_xmit(conn); /* Nuke any messages we decided not to retransmit. */ @@ -380,10 +404,15 @@ restart: * If the transport cannot continue (i.e ret != 0), then it must * call us when more room is available, such as from the tx * completion handler. + * + * We have an extra generation check here so that if someone manages + * to jump in after our release_in_xmit, we'll see that they have done + * some work and we will skip our goto */ if (ret == 0) { smp_mb(); - if (!list_empty(&conn->c_send_queue)) { + if (!list_empty(&conn->c_send_queue) && + send_gen == conn->c_send_gen) { rds_stats_inc(s_send_lock_queue_raced); goto restart; } diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c index 481f89f93789..4505a691d88c 100644 --- a/net/rxrpc/ar-input.c +++ b/net/rxrpc/ar-input.c @@ -28,7 +28,7 @@ const char *rxrpc_pkts[] = { "?00", "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG", - "?09", "?10", "?11", "?12", "?13", "?14", "?15" + "?09", "?10", "?11", "?12", "VERSION", "?14", "?15" }; /* @@ -593,6 +593,20 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, rxrpc_queue_conn(conn); } +/* + * post endpoint-level events to the local endpoint + * - this includes debug and version messages + */ +static void rxrpc_post_packet_to_local(struct rxrpc_local *local, + struct sk_buff *skb) +{ + _enter("%p,%p", local, skb); + + atomic_inc(&local->usage); + skb_queue_tail(&local->event_queue, skb); + rxrpc_queue_work(&local->event_processor); +} + static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local, struct sk_buff *skb, struct rxrpc_skb_priv *sp) @@ -699,6 +713,11 @@ void rxrpc_data_ready(struct sock *sk) goto bad_message; } + if (sp->hdr.type == RXRPC_PACKET_TYPE_VERSION) { + rxrpc_post_packet_to_local(local, skb); + goto out; + } + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && (sp->hdr.callNumber == 0 || sp->hdr.seq == 0)) goto bad_message; @@ -731,6 +750,8 @@ void rxrpc_data_ready(struct sock *sk) else goto cant_route_call; } + +out: rxrpc_put_local(local); return; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 2fc1e659e5c9..aef1bd294e17 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -152,11 +152,13 @@ struct rxrpc_local { struct work_struct destroyer; /* endpoint destroyer */ struct work_struct acceptor; /* incoming call processor */ struct work_struct rejecter; /* packet reject writer */ + struct work_struct event_processor; /* endpoint event processor */ struct list_head services; /* services listening on this endpoint */ struct list_head link; /* link in endpoint list */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */ struct sk_buff_head reject_queue; /* packets awaiting rejection */ + struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */ spinlock_t lock; /* access lock */ rwlock_t services_lock; /* lock for services list */ atomic_t usage; diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c index 87f7135d238b..ca904ed5400a 100644 --- a/net/rxrpc/ar-local.c +++ b/net/rxrpc/ar-local.c @@ -13,16 +13,22 @@ #include <linux/net.h> #include <linux/skbuff.h> #include <linux/slab.h> +#include <linux/udp.h> +#include <linux/ip.h> #include <net/sock.h> #include <net/af_rxrpc.h> +#include <generated/utsrelease.h> #include "ar-internal.h" +static const char rxrpc_version_string[65] = "linux-" UTS_RELEASE " AF_RXRPC"; + static LIST_HEAD(rxrpc_locals); DEFINE_RWLOCK(rxrpc_local_lock); static DECLARE_RWSEM(rxrpc_local_sem); static DECLARE_WAIT_QUEUE_HEAD(rxrpc_local_wq); static void rxrpc_destroy_local(struct work_struct *work); +static void rxrpc_process_local_events(struct work_struct *work); /* * allocate a new local @@ -37,11 +43,13 @@ struct rxrpc_local *rxrpc_alloc_local(struct sockaddr_rxrpc *srx) INIT_WORK(&local->destroyer, &rxrpc_destroy_local); INIT_WORK(&local->acceptor, &rxrpc_accept_incoming_calls); INIT_WORK(&local->rejecter, &rxrpc_reject_packets); + INIT_WORK(&local->event_processor, &rxrpc_process_local_events); INIT_LIST_HEAD(&local->services); INIT_LIST_HEAD(&local->link); init_rwsem(&local->defrag_sem); skb_queue_head_init(&local->accept_queue); skb_queue_head_init(&local->reject_queue); + skb_queue_head_init(&local->event_queue); spin_lock_init(&local->lock); rwlock_init(&local->services_lock); atomic_set(&local->usage, 1); @@ -264,10 +272,12 @@ static void rxrpc_destroy_local(struct work_struct *work) ASSERT(list_empty(&local->services)); ASSERT(!work_pending(&local->acceptor)); ASSERT(!work_pending(&local->rejecter)); + ASSERT(!work_pending(&local->event_processor)); /* finish cleaning up the local descriptor */ rxrpc_purge_queue(&local->accept_queue); rxrpc_purge_queue(&local->reject_queue); + rxrpc_purge_queue(&local->event_queue); kernel_sock_shutdown(local->socket, SHUT_RDWR); sock_release(local->socket); @@ -308,3 +318,91 @@ void __exit rxrpc_destroy_all_locals(void) _leave(""); } + +/* + * Reply to a version request + */ +static void rxrpc_send_version_request(struct rxrpc_local *local, + struct rxrpc_header *hdr, + struct sk_buff *skb) +{ + struct sockaddr_in sin; + struct msghdr msg; + struct kvec iov[2]; + size_t len; + int ret; + + _enter(""); + + sin.sin_family = AF_INET; + sin.sin_port = udp_hdr(skb)->source; + sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + + msg.msg_name = &sin; + msg.msg_namelen = sizeof(sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + hdr->seq = 0; + hdr->serial = 0; + hdr->type = RXRPC_PACKET_TYPE_VERSION; + hdr->flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED); + hdr->userStatus = 0; + hdr->_rsvd = 0; + + iov[0].iov_base = hdr; + iov[0].iov_len = sizeof(*hdr); + iov[1].iov_base = (char *)rxrpc_version_string; + iov[1].iov_len = sizeof(rxrpc_version_string); + + len = iov[0].iov_len + iov[1].iov_len; + + _proto("Tx VERSION (reply)"); + + ret = kernel_sendmsg(local->socket, &msg, iov, 2, len); + if (ret < 0) + _debug("sendmsg failed: %d", ret); + + _leave(""); +} + +/* + * Process event packets targetted at a local endpoint. + */ +static void rxrpc_process_local_events(struct work_struct *work) +{ + struct rxrpc_local *local = container_of(work, struct rxrpc_local, event_processor); + struct sk_buff *skb; + char v; + + _enter(""); + + atomic_inc(&local->usage); + + while ((skb = skb_dequeue(&local->event_queue))) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + kdebug("{%d},{%u}", local->debug_id, sp->hdr.type); + + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_VERSION: + if (skb_copy_bits(skb, 0, &v, 1) < 0) + return; + _proto("Rx VERSION { %02x }", v); + if (v == 0) + rxrpc_send_version_request(local, &sp->hdr, skb); + break; + + default: + /* Just ignore anything we don't understand */ + break; + } + + rxrpc_put_local(local); + rxrpc_free_skb(skb); + } + + rxrpc_put_local(local); + _leave(""); +} diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index 09f584566e23..c0042807bfc6 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -542,11 +542,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, call->tx_pending = NULL; copied = 0; - if (len > iov_iter_count(&msg->msg_iter)) - len = iov_iter_count(&msg->msg_iter); - while (len) { - int copy; - + do { if (!skb) { size_t size, chunk, max, space; @@ -568,8 +564,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, max &= ~(call->conn->size_align - 1UL); chunk = max; - if (chunk > len && !more) - chunk = len; + if (chunk > msg_data_left(msg) && !more) + chunk = msg_data_left(msg); space = chunk + call->conn->size_align; space &= ~(call->conn->size_align - 1UL); @@ -612,23 +608,23 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, sp = rxrpc_skb(skb); /* append next segment of data to the current buffer */ - copy = skb_tailroom(skb); - ASSERTCMP(copy, >, 0); - if (copy > len) - copy = len; - if (copy > sp->remain) - copy = sp->remain; - - _debug("add"); - ret = skb_add_data(skb, &msg->msg_iter, copy); - _debug("added"); - if (ret < 0) - goto efault; - sp->remain -= copy; - skb->mark += copy; - copied += copy; - - len -= copy; + if (msg_data_left(msg) > 0) { + int copy = skb_tailroom(skb); + ASSERTCMP(copy, >, 0); + if (copy > msg_data_left(msg)) + copy = msg_data_left(msg); + if (copy > sp->remain) + copy = sp->remain; + + _debug("add"); + ret = skb_add_data(skb, &msg->msg_iter, copy); + _debug("added"); + if (ret < 0) + goto efault; + sp->remain -= copy; + skb->mark += copy; + copied += copy; + } /* check for the far side aborting the call or a network error * occurring */ @@ -636,7 +632,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, goto call_aborted; /* add the packet to the send queue if it's now full */ - if (sp->remain <= 0 || (!len && !more)) { + if (sp->remain <= 0 || + (msg_data_left(msg) == 0 && !more)) { struct rxrpc_connection *conn = call->conn; uint32_t seq; size_t pad; @@ -666,7 +663,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, sp->hdr.serviceId = conn->service_id; sp->hdr.flags = conn->out_clientflag; - if (len == 0 && !more) + if (msg_data_left(msg) == 0 && !more) sp->hdr.flags |= RXRPC_LAST_PACKET; else if (CIRC_SPACE(call->acks_head, call->acks_tail, call->acks_winsz) > 1) @@ -682,10 +679,10 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, memcpy(skb->head, &sp->hdr, sizeof(struct rxrpc_header)); - rxrpc_queue_packet(call, skb, !iov_iter_count(&msg->msg_iter) && !more); + rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more); skb = NULL; } - } + } while (msg_data_left(msg) > 0); success: ret = copied; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index eb5b8445fef9..4cdbfb85686a 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -88,11 +88,19 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* ------------------------------------------------------------- */ +static int ingress_init(struct Qdisc *sch, struct nlattr *opt) +{ + net_inc_ingress_queue(); + + return 0; +} + static void ingress_destroy(struct Qdisc *sch) { struct ingress_qdisc_data *p = qdisc_priv(sch); tcf_destroy_chain(&p->filter_list); + net_dec_ingress_queue(); } static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -124,6 +132,7 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .id = "ingress", .priv_size = sizeof(struct ingress_qdisc_data), .enqueue = ingress_enqueue, + .init = ingress_init, .destroy = ingress_destroy, .dump = ingress_dump, .owner = THIS_MODULE, diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 179f1c8c0d8b..956ead2cab9a 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -560,8 +560,8 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) tfifo_dequeue: skb = __skb_dequeue(&sch->q); if (skb) { -deliver: qdisc_qstats_backlog_dec(sch, skb); +deliver: qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); return skb; @@ -578,6 +578,7 @@ deliver: rb_erase(p, &q->t_root); sch->q.qlen--; + qdisc_qstats_backlog_dec(sch, skb); skb->next = NULL; skb->prev = NULL; skb->tstamp = netem_skb_cb(skb)->tstamp_save; diff --git a/net/socket.c b/net/socket.c index 073809f4125f..5b0126234606 100644 --- a/net/socket.c +++ b/net/socket.c @@ -610,35 +610,27 @@ void __sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) } EXPORT_SYMBOL(__sock_tx_timestamp); -static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg, - size_t size) +static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { - return sock->ops->sendmsg(sock, msg, size); + int ret = sock->ops->sendmsg(sock, msg, msg_data_left(msg)); + BUG_ON(ret == -EIOCBQUEUED); + return ret; } -int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +int sock_sendmsg(struct socket *sock, struct msghdr *msg) { - int err = security_socket_sendmsg(sock, msg, size); + int err = security_socket_sendmsg(sock, msg, + msg_data_left(msg)); - return err ?: sock_sendmsg_nosec(sock, msg, size); + return err ?: sock_sendmsg_nosec(sock, msg); } EXPORT_SYMBOL(sock_sendmsg); int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { - mm_segment_t oldfs = get_fs(); - int result; - - set_fs(KERNEL_DS); - /* - * the following is safe, since for compiler definitions of kvec and - * iovec are identical, yielding the same in-core layout and alignment - */ - iov_iter_init(&msg->msg_iter, WRITE, (struct iovec *)vec, num, size); - result = sock_sendmsg(sock, msg, size); - set_fs(oldfs); - return result; + iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, vec, num, size); + return sock_sendmsg(sock, msg); } EXPORT_SYMBOL(kernel_sendmsg); @@ -755,12 +747,8 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, mm_segment_t oldfs = get_fs(); int result; + iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, vec, num, size); set_fs(KERNEL_DS); - /* - * the following is safe, since for compiler definitions of kvec and - * iovec are identical, yielding the same in-core layout and alignment - */ - iov_iter_init(&msg->msg_iter, READ, (struct iovec *)vec, num, size); result = sock_recvmsg(sock, msg, size, flags); set_fs(oldfs); return result; @@ -808,10 +796,10 @@ static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) if (iocb->ki_pos != 0) return -ESPIPE; - if (iocb->ki_nbytes == 0) /* Match SYS5 behaviour */ + if (!iov_iter_count(to)) /* Match SYS5 behaviour */ return 0; - res = sock_recvmsg(sock, &msg, iocb->ki_nbytes, msg.msg_flags); + res = sock_recvmsg(sock, &msg, iov_iter_count(to), msg.msg_flags); *to = msg.msg_iter; return res; } @@ -833,7 +821,7 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; - res = sock_sendmsg(sock, &msg, iocb->ki_nbytes); + res = sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; } @@ -1650,18 +1638,14 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, struct iovec iov; int fput_needed; - if (len > INT_MAX) - len = INT_MAX; - if (unlikely(!access_ok(VERIFY_READ, buff, len))) - return -EFAULT; + err = import_single_range(WRITE, buff, len, &iov, &msg.msg_iter); + if (unlikely(err)) + return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; - iov.iov_base = buff; - iov.iov_len = len; msg.msg_name = NULL; - iov_iter_init(&msg.msg_iter, WRITE, &iov, 1, len); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_namelen = 0; @@ -1675,7 +1659,7 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; - err = sock_sendmsg(sock, &msg, len); + err = sock_sendmsg(sock, &msg); out_put: fput_light(sock->file, fput_needed); @@ -1710,26 +1694,22 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, int err, err2; int fput_needed; - if (size > INT_MAX) - size = INT_MAX; - if (unlikely(!access_ok(VERIFY_WRITE, ubuf, size))) - return -EFAULT; + err = import_single_range(READ, ubuf, size, &iov, &msg.msg_iter); + if (unlikely(err)) + return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; msg.msg_control = NULL; msg.msg_controllen = 0; - iov.iov_len = size; - iov.iov_base = ubuf; - iov_iter_init(&msg.msg_iter, READ, &iov, 1, size); /* Save some cycles and don't copy the address if not needed */ msg.msg_name = addr ? (struct sockaddr *)&address : NULL; /* We assume all kernel code knows the size of sockaddr_storage */ msg.msg_namelen = 0; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; - err = sock_recvmsg(sock, &msg, size, flags); + err = sock_recvmsg(sock, &msg, iov_iter_count(&msg.msg_iter), flags); if (err >= 0 && addr != NULL) { err2 = move_addr_to_user(&address, @@ -1849,10 +1829,10 @@ struct used_address { unsigned int name_len; }; -static ssize_t copy_msghdr_from_user(struct msghdr *kmsg, - struct user_msghdr __user *umsg, - struct sockaddr __user **save_addr, - struct iovec **iov) +static int copy_msghdr_from_user(struct msghdr *kmsg, + struct user_msghdr __user *umsg, + struct sockaddr __user **save_addr, + struct iovec **iov) { struct sockaddr __user *uaddr; struct iovec __user *uiov; @@ -1898,13 +1878,8 @@ static ssize_t copy_msghdr_from_user(struct msghdr *kmsg, kmsg->msg_iocb = NULL; - err = rw_copy_check_uvector(save_addr ? READ : WRITE, - uiov, nr_segs, - UIO_FASTIOV, *iov, iov); - if (err >= 0) - iov_iter_init(&kmsg->msg_iter, save_addr ? READ : WRITE, - *iov, nr_segs, err); - return err; + return import_iovec(save_addr ? READ : WRITE, uiov, nr_segs, + UIO_FASTIOV, iov, &kmsg->msg_iter); } static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, @@ -1919,7 +1894,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, __attribute__ ((aligned(sizeof(__kernel_size_t)))); /* 20 is size of ipv6_pktinfo */ unsigned char *ctl_buf = ctl; - int ctl_len, total_len; + int ctl_len; ssize_t err; msg_sys->msg_name = &address; @@ -1929,8 +1904,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, else err = copy_msghdr_from_user(msg_sys, msg, NULL, &iov); if (err < 0) - goto out_freeiov; - total_len = err; + return err; err = -ENOBUFS; @@ -1977,10 +1951,10 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, used_address->name_len == msg_sys->msg_namelen && !memcmp(&used_address->name, msg_sys->msg_name, used_address->name_len)) { - err = sock_sendmsg_nosec(sock, msg_sys, total_len); + err = sock_sendmsg_nosec(sock, msg_sys); goto out_freectl; } - err = sock_sendmsg(sock, msg_sys, total_len); + err = sock_sendmsg(sock, msg_sys); /* * If this is sendmmsg() and sending to current destination address was * successful, remember it. @@ -1996,8 +1970,7 @@ out_freectl: if (ctl_buf != ctl) sock_kfree_s(sock->sk, ctl_buf, ctl_len); out_freeiov: - if (iov != iovstack) - kfree(iov); + kfree(iov); return err; } @@ -2122,8 +2095,8 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, else err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov); if (err < 0) - goto out_freeiov; - total_len = err; + return err; + total_len = iov_iter_count(&msg_sys->msg_iter); cmsg_ptr = (unsigned long)msg_sys->msg_control; msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); @@ -2161,8 +2134,7 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, err = len; out_freeiov: - if (iov != iovstack) - kfree(iov); + kfree(iov); return err; } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index cc331b6cf573..0c8120229a03 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -257,7 +257,7 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) svc_set_cmsg_data(rqstp, cmh); - if (sock_sendmsg(sock, &msg, 0) < 0) + if (sock_sendmsg(sock, &msg) < 0) goto out; } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 85d1d4764612..526c4feb3b50 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -238,11 +238,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) skb->sp->xvec[skb->sp->len++] = x; - if (xfrm_tunnel_check(skb, x, family)) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); - goto drop; - } - spin_lock(&x->lock); if (unlikely(x->km.state == XFRM_STATE_ACQ)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); @@ -271,6 +266,11 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) spin_unlock(&x->lock); + if (xfrm_tunnel_check(skb, x, family)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); + goto drop; + } + seq_hi = htonl(xfrm_replay_seqhi(x, seq)); XFRM_SKB_CB(skb)->seq.input.low = seq; diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 30594bfa5fb1..2bbb41822d8e 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -153,6 +153,8 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) switch (sclass) { case SECCLASS_NETLINK_ROUTE_SOCKET: + /* RTM_MAX always point to RTM_SETxxxx, ie RTM_NEWxxx + 3 */ + BUILD_BUG_ON(RTM_MAX != (RTM_NEWNSID + 3)); err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms, sizeof(nlmsg_route_perms)); break; @@ -163,6 +165,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) break; case SECCLASS_NETLINK_XFRM_SOCKET: + BUILD_BUG_ON(XFRM_MSG_MAX != XFRM_MSG_MAPPING); err = nlmsg_perm(nlmsg_type, perm, nlmsg_xfrm_perms, sizeof(nlmsg_xfrm_perms)); break; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 279e24f61305..a69ebc79bc50 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -25,7 +25,6 @@ #include <linux/slab.h> #include <linux/time.h> #include <linux/pm_qos.h> -#include <linux/aio.h> #include <linux/io.h> #include <linux/dma-mapping.h> #include <sound/core.h> @@ -35,6 +34,7 @@ #include <sound/pcm_params.h> #include <sound/timer.h> #include <sound/minors.h> +#include <linux/uio.h> /* * Compatibility |