diff options
Diffstat (limited to 'net')
41 files changed, 804 insertions, 216 deletions
diff --git a/net/9p/Kconfig b/net/9p/Kconfig index a75174a33723..e6014e0e51f7 100644 --- a/net/9p/Kconfig +++ b/net/9p/Kconfig @@ -22,6 +22,15 @@ config NET_9P_VIRTIO This builds support for a transports between guest partitions and a host partition. +config NET_9P_XEN + depends on XEN + select XEN_XENBUS_FRONTEND + tristate "9P Xen Transport" + help + This builds support for a transport for 9pfs between + two Xen domains. + + config NET_9P_RDMA depends on INET && INFINIBAND && INFINIBAND_ADDR_TRANS tristate "9P RDMA Transport (Experimental)" diff --git a/net/9p/Makefile b/net/9p/Makefile index a0874cc1f718..697ea7caf466 100644 --- a/net/9p/Makefile +++ b/net/9p/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_NET_9P) := 9pnet.o +obj-$(CONFIG_NET_9P_XEN) += 9pnet_xen.o obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o @@ -14,5 +15,8 @@ obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o 9pnet_virtio-objs := \ trans_virtio.o \ +9pnet_xen-objs := \ + trans_xen.o \ + 9pnet_rdma-objs := \ trans_rdma.o \ diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c new file mode 100644 index 000000000000..71e85643b3f9 --- /dev/null +++ b/net/9p/trans_xen.c @@ -0,0 +1,545 @@ +/* + * linux/fs/9p/trans_xen + * + * Xen transport layer. + * + * Copyright (C) 2017 by Stefano Stabellini <stefano@aporeto.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <xen/events.h> +#include <xen/grant_table.h> +#include <xen/xen.h> +#include <xen/xenbus.h> +#include <xen/interface/io/9pfs.h> + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/rwlock.h> +#include <net/9p/9p.h> +#include <net/9p/client.h> +#include <net/9p/transport.h> + +#define XEN_9PFS_NUM_RINGS 2 +#define XEN_9PFS_RING_ORDER 6 +#define XEN_9PFS_RING_SIZE XEN_FLEX_RING_SIZE(XEN_9PFS_RING_ORDER) + +struct xen_9pfs_header { + uint32_t size; + uint8_t id; + uint16_t tag; + + /* uint8_t sdata[]; */ +} __attribute__((packed)); + +/* One per ring, more than one per 9pfs share */ +struct xen_9pfs_dataring { + struct xen_9pfs_front_priv *priv; + + struct xen_9pfs_data_intf *intf; + grant_ref_t ref; + int evtchn; + int irq; + /* protect a ring from concurrent accesses */ + spinlock_t lock; + + struct xen_9pfs_data data; + wait_queue_head_t wq; + struct work_struct work; +}; + +/* One per 9pfs share */ +struct xen_9pfs_front_priv { + struct list_head list; + struct xenbus_device *dev; + char *tag; + struct p9_client *client; + + int num_rings; + struct xen_9pfs_dataring *rings; +}; + +static LIST_HEAD(xen_9pfs_devs); +static DEFINE_RWLOCK(xen_9pfs_lock); + +/* We don't currently allow canceling of requests */ +static int p9_xen_cancel(struct p9_client *client, struct p9_req_t *req) +{ + return 1; +} + +static int p9_xen_create(struct p9_client *client, const char *addr, char *args) +{ + struct xen_9pfs_front_priv *priv; + + read_lock(&xen_9pfs_lock); + list_for_each_entry(priv, &xen_9pfs_devs, list) { + if (!strcmp(priv->tag, addr)) { + priv->client = client; + read_unlock(&xen_9pfs_lock); + return 0; + } + } + read_unlock(&xen_9pfs_lock); + return -EINVAL; +} + +static void p9_xen_close(struct p9_client *client) +{ + struct xen_9pfs_front_priv *priv; + + read_lock(&xen_9pfs_lock); + list_for_each_entry(priv, &xen_9pfs_devs, list) { + if (priv->client == client) { + priv->client = NULL; + read_unlock(&xen_9pfs_lock); + return; + } + } + read_unlock(&xen_9pfs_lock); +} + +static bool p9_xen_write_todo(struct xen_9pfs_dataring *ring, RING_IDX size) +{ + RING_IDX cons, prod; + + cons = ring->intf->out_cons; + prod = ring->intf->out_prod; + virt_mb(); + + return XEN_9PFS_RING_SIZE - + xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE) >= size; +} + +static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) +{ + struct xen_9pfs_front_priv *priv = NULL; + RING_IDX cons, prod, masked_cons, masked_prod; + unsigned long flags; + u32 size = p9_req->tc->size; + struct xen_9pfs_dataring *ring; + int num; + + read_lock(&xen_9pfs_lock); + list_for_each_entry(priv, &xen_9pfs_devs, list) { + if (priv->client == client) + break; + } + read_unlock(&xen_9pfs_lock); + if (!priv || priv->client != client) + return -EINVAL; + + num = p9_req->tc->tag % priv->num_rings; + ring = &priv->rings[num]; + +again: + while (wait_event_interruptible(ring->wq, + p9_xen_write_todo(ring, size)) != 0) + ; + + spin_lock_irqsave(&ring->lock, flags); + cons = ring->intf->out_cons; + prod = ring->intf->out_prod; + virt_mb(); + + if (XEN_9PFS_RING_SIZE - xen_9pfs_queued(prod, cons, + XEN_9PFS_RING_SIZE) < size) { + spin_unlock_irqrestore(&ring->lock, flags); + goto again; + } + + masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE); + masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); + + xen_9pfs_write_packet(ring->data.out, p9_req->tc->sdata, size, + &masked_prod, masked_cons, XEN_9PFS_RING_SIZE); + + p9_req->status = REQ_STATUS_SENT; + virt_wmb(); /* write ring before updating pointer */ + prod += size; + ring->intf->out_prod = prod; + spin_unlock_irqrestore(&ring->lock, flags); + notify_remote_via_irq(ring->irq); + + return 0; +} + +static void p9_xen_response(struct work_struct *work) +{ + struct xen_9pfs_front_priv *priv; + struct xen_9pfs_dataring *ring; + RING_IDX cons, prod, masked_cons, masked_prod; + struct xen_9pfs_header h; + struct p9_req_t *req; + int status; + + ring = container_of(work, struct xen_9pfs_dataring, work); + priv = ring->priv; + + while (1) { + cons = ring->intf->in_cons; + prod = ring->intf->in_prod; + virt_rmb(); + + if (xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE) < + sizeof(h)) { + notify_remote_via_irq(ring->irq); + return; + } + + masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE); + masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); + + /* First, read just the header */ + xen_9pfs_read_packet(&h, ring->data.in, sizeof(h), + masked_prod, &masked_cons, + XEN_9PFS_RING_SIZE); + + req = p9_tag_lookup(priv->client, h.tag); + if (!req || req->status != REQ_STATUS_SENT) { + dev_warn(&priv->dev->dev, "Wrong req tag=%x\n", h.tag); + cons += h.size; + virt_mb(); + ring->intf->in_cons = cons; + continue; + } + + memcpy(req->rc, &h, sizeof(h)); + req->rc->offset = 0; + + masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); + /* Then, read the whole packet (including the header) */ + xen_9pfs_read_packet(req->rc->sdata, ring->data.in, h.size, + masked_prod, &masked_cons, + XEN_9PFS_RING_SIZE); + + virt_mb(); + cons += h.size; + ring->intf->in_cons = cons; + + status = (req->status != REQ_STATUS_ERROR) ? + REQ_STATUS_RCVD : REQ_STATUS_ERROR; + + p9_client_cb(priv->client, req, status); + } +} + +static irqreturn_t xen_9pfs_front_event_handler(int irq, void *r) +{ + struct xen_9pfs_dataring *ring = r; + + if (!ring || !ring->priv->client) { + /* ignore spurious interrupt */ + return IRQ_HANDLED; + } + + wake_up_interruptible(&ring->wq); + schedule_work(&ring->work); + + return IRQ_HANDLED; +} + +static struct p9_trans_module p9_xen_trans = { + .name = "xen", + .maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT), + .def = 1, + .create = p9_xen_create, + .close = p9_xen_close, + .request = p9_xen_request, + .cancel = p9_xen_cancel, + .owner = THIS_MODULE, +}; + +static const struct xenbus_device_id xen_9pfs_front_ids[] = { + { "9pfs" }, + { "" } +}; + +static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv) +{ + int i, j; + + write_lock(&xen_9pfs_lock); + list_del(&priv->list); + write_unlock(&xen_9pfs_lock); + + for (i = 0; i < priv->num_rings; i++) { + if (!priv->rings[i].intf) + break; + if (priv->rings[i].irq > 0) + unbind_from_irqhandler(priv->rings[i].irq, priv->dev); + if (priv->rings[i].data.in) { + for (j = 0; j < (1 << XEN_9PFS_RING_ORDER); j++) { + grant_ref_t ref; + + ref = priv->rings[i].intf->ref[j]; + gnttab_end_foreign_access(ref, 0, 0); + } + free_pages((unsigned long)priv->rings[i].data.in, + XEN_9PFS_RING_ORDER - + (PAGE_SHIFT - XEN_PAGE_SHIFT)); + } + gnttab_end_foreign_access(priv->rings[i].ref, 0, 0); + free_page((unsigned long)priv->rings[i].intf); + } + kfree(priv->rings); + kfree(priv->tag); + kfree(priv); +} + +static int xen_9pfs_front_remove(struct xenbus_device *dev) +{ + struct xen_9pfs_front_priv *priv = dev_get_drvdata(&dev->dev); + + dev_set_drvdata(&dev->dev, NULL); + xen_9pfs_front_free(priv); + return 0; +} + +static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev, + struct xen_9pfs_dataring *ring) +{ + int i = 0; + int ret = -ENOMEM; + void *bytes = NULL; + + init_waitqueue_head(&ring->wq); + spin_lock_init(&ring->lock); + INIT_WORK(&ring->work, p9_xen_response); + + ring->intf = (struct xen_9pfs_data_intf *)get_zeroed_page(GFP_KERNEL); + if (!ring->intf) + return ret; + ret = gnttab_grant_foreign_access(dev->otherend_id, + virt_to_gfn(ring->intf), 0); + if (ret < 0) + goto out; + ring->ref = ret; + bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + XEN_9PFS_RING_ORDER - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + if (!bytes) { + ret = -ENOMEM; + goto out; + } + for (; i < (1 << XEN_9PFS_RING_ORDER); i++) { + ret = gnttab_grant_foreign_access( + dev->otherend_id, virt_to_gfn(bytes) + i, 0); + if (ret < 0) + goto out; + ring->intf->ref[i] = ret; + } + ring->intf->ring_order = XEN_9PFS_RING_ORDER; + ring->data.in = bytes; + ring->data.out = bytes + XEN_9PFS_RING_SIZE; + + ret = xenbus_alloc_evtchn(dev, &ring->evtchn); + if (ret) + goto out; + ring->irq = bind_evtchn_to_irqhandler(ring->evtchn, + xen_9pfs_front_event_handler, + 0, "xen_9pfs-frontend", ring); + if (ring->irq >= 0) + return 0; + + xenbus_free_evtchn(dev, ring->evtchn); + ret = ring->irq; +out: + if (bytes) { + for (i--; i >= 0; i--) + gnttab_end_foreign_access(ring->intf->ref[i], 0, 0); + free_pages((unsigned long)bytes, + XEN_9PFS_RING_ORDER - + (PAGE_SHIFT - XEN_PAGE_SHIFT)); + } + gnttab_end_foreign_access(ring->ref, 0, 0); + free_page((unsigned long)ring->intf); + return ret; +} + +static int xen_9pfs_front_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int ret, i; + struct xenbus_transaction xbt; + struct xen_9pfs_front_priv *priv = NULL; + char *versions; + unsigned int max_rings, max_ring_order, len = 0; + + versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len); + if (!len) + return -EINVAL; + if (strcmp(versions, "1")) { + kfree(versions); + return -EINVAL; + } + kfree(versions); + max_rings = xenbus_read_unsigned(dev->otherend, "max-rings", 0); + if (max_rings < XEN_9PFS_NUM_RINGS) + return -EINVAL; + max_ring_order = xenbus_read_unsigned(dev->otherend, + "max-ring-page-order", 0); + if (max_ring_order < XEN_9PFS_RING_ORDER) + return -EINVAL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->dev = dev; + priv->num_rings = XEN_9PFS_NUM_RINGS; + priv->rings = kcalloc(priv->num_rings, sizeof(*priv->rings), + GFP_KERNEL); + if (!priv->rings) { + kfree(priv); + return -ENOMEM; + } + + for (i = 0; i < priv->num_rings; i++) { + priv->rings[i].priv = priv; + ret = xen_9pfs_front_alloc_dataring(dev, &priv->rings[i]); + if (ret < 0) + goto error; + } + + again: + ret = xenbus_transaction_start(&xbt); + if (ret) { + xenbus_dev_fatal(dev, ret, "starting transaction"); + goto error; + } + ret = xenbus_printf(xbt, dev->nodename, "version", "%u", 1); + if (ret) + goto error_xenbus; + ret = xenbus_printf(xbt, dev->nodename, "num-rings", "%u", + priv->num_rings); + if (ret) + goto error_xenbus; + for (i = 0; i < priv->num_rings; i++) { + char str[16]; + + BUILD_BUG_ON(XEN_9PFS_NUM_RINGS > 9); + sprintf(str, "ring-ref%u", i); + ret = xenbus_printf(xbt, dev->nodename, str, "%d", + priv->rings[i].ref); + if (ret) + goto error_xenbus; + + sprintf(str, "event-channel-%u", i); + ret = xenbus_printf(xbt, dev->nodename, str, "%u", + priv->rings[i].evtchn); + if (ret) + goto error_xenbus; + } + priv->tag = xenbus_read(xbt, dev->nodename, "tag", NULL); + if (!priv->tag) { + ret = -EINVAL; + goto error_xenbus; + } + ret = xenbus_transaction_end(xbt, 0); + if (ret) { + if (ret == -EAGAIN) + goto again; + xenbus_dev_fatal(dev, ret, "completing transaction"); + goto error; + } + + write_lock(&xen_9pfs_lock); + list_add_tail(&priv->list, &xen_9pfs_devs); + write_unlock(&xen_9pfs_lock); + dev_set_drvdata(&dev->dev, priv); + xenbus_switch_state(dev, XenbusStateInitialised); + + return 0; + + error_xenbus: + xenbus_transaction_end(xbt, 1); + xenbus_dev_fatal(dev, ret, "writing xenstore"); + error: + dev_set_drvdata(&dev->dev, NULL); + xen_9pfs_front_free(priv); + return ret; +} + +static int xen_9pfs_front_resume(struct xenbus_device *dev) +{ + dev_warn(&dev->dev, "suspsend/resume unsupported\n"); + return 0; +} + +static void xen_9pfs_front_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + switch (backend_state) { + case XenbusStateReconfiguring: + case XenbusStateReconfigured: + case XenbusStateInitialising: + case XenbusStateInitialised: + case XenbusStateUnknown: + break; + + case XenbusStateInitWait: + break; + + case XenbusStateConnected: + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateClosed: + if (dev->state == XenbusStateClosed) + break; + /* Missed the backend's CLOSING state -- fallthrough */ + case XenbusStateClosing: + xenbus_frontend_closed(dev); + break; + } +} + +static struct xenbus_driver xen_9pfs_front_driver = { + .ids = xen_9pfs_front_ids, + .probe = xen_9pfs_front_probe, + .remove = xen_9pfs_front_remove, + .resume = xen_9pfs_front_resume, + .otherend_changed = xen_9pfs_front_changed, +}; + +int p9_trans_xen_init(void) +{ + if (!xen_domain()) + return -ENODEV; + + pr_info("Initialising Xen transport for 9pfs\n"); + + v9fs_register_trans(&p9_xen_trans); + return xenbus_register_frontend(&xen_9pfs_front_driver); +} +module_init(p9_trans_xen_init); + +void p9_trans_xen_exit(void) +{ + v9fs_unregister_trans(&p9_xen_trans); + return xenbus_unregister_driver(&xen_9pfs_front_driver); +} +module_exit(p9_trans_xen_exit); diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c index 4e0b0c359325..e0bb624c3845 100644 --- a/net/bridge/netfilter/ebt_dnat.c +++ b/net/bridge/netfilter/ebt_dnat.c @@ -9,6 +9,7 @@ */ #include <linux/module.h> #include <net/sock.h> +#include "../br_private.h" #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> @@ -18,11 +19,30 @@ static unsigned int ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nat_info *info = par->targinfo; + struct net_device *dev; if (!skb_make_writable(skb, 0)) return EBT_DROP; ether_addr_copy(eth_hdr(skb)->h_dest, info->mac); + + if (is_multicast_ether_addr(info->mac)) { + if (is_broadcast_ether_addr(info->mac)) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + } else { + if (xt_hooknum(par) != NF_BR_BROUTING) + dev = br_port_get_rcu(xt_in(par))->br->dev; + else + dev = xt_in(par); + + if (ether_addr_equal(info->mac, dev->dev_addr)) + skb->pkt_type = PACKET_HOST; + else + skb->pkt_type = PACKET_OTHERHOST; + } + return info->target; } diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 108533859a53..4eb773ccce11 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -187,7 +187,7 @@ void *ceph_kvmalloc(size_t size, gfp_t flags) return ptr; } - return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, flags, PAGE_KERNEL); } diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f76bb3332613..5766a6c896c4 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1386,8 +1386,9 @@ static void prepare_write_keepalive(struct ceph_connection *con) dout("prepare_write_keepalive %p\n", con); con_out_kvec_reset(con); if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { - struct timespec now = CURRENT_TIME; + struct timespec now; + ktime_get_real_ts(&now); con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); ceph_encode_timespec(&con->out_temp_keepalive2, &now); con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), @@ -3176,8 +3177,9 @@ bool ceph_con_keepalive_expired(struct ceph_connection *con, { if (interval > 0 && (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) { - struct timespec now = CURRENT_TIME; + struct timespec now; struct timespec ts; + ktime_get_real_ts(&now); jiffies_to_timespec(interval, &ts); ts = timespec_add(con->last_keepalive_ack, ts); return timespec_compare(&now, &ts) >= 0; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index e15ea9e4c495..242d7c0d92f8 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -3574,7 +3574,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, ceph_oid_copy(&lreq->t.base_oid, oid); ceph_oloc_copy(&lreq->t.base_oloc, oloc); lreq->t.flags = CEPH_OSD_FLAG_WRITE; - lreq->mtime = CURRENT_TIME; + ktime_get_real_ts(&lreq->mtime); lreq->reg_req = alloc_linger_request(lreq); if (!lreq->reg_req) { @@ -3632,7 +3632,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc, ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); req->r_flags = CEPH_OSD_FLAG_WRITE; - req->r_mtime = CURRENT_TIME; + ktime_get_real_ts(&req->r_mtime); osd_req_op_watch_init(req, 0, lreq->linger_id, CEPH_OSD_WATCH_OP_UNWATCH); diff --git a/net/core/dev.c b/net/core/dev.c index d07aa5ffb511..96cf83da0d66 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -81,6 +81,7 @@ #include <linux/hash.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/mutex.h> #include <linux/string.h> #include <linux/mm.h> @@ -4235,7 +4236,7 @@ static int __netif_receive_skb(struct sk_buff *skb) int ret; if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { - unsigned long pflags = current->flags; + unsigned int noreclaim_flag; /* * PFMEMALLOC skbs are special, they should @@ -4246,9 +4247,9 @@ static int __netif_receive_skb(struct sk_buff *skb) * Use PF_MEMALLOC as this saves us from propagating the allocation * context down to all allocation sites. */ - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); ret = __netif_receive_skb_core(skb, true); - current_restore_flags(pflags, PF_MEMALLOC); + memalloc_noreclaim_restore(noreclaim_flag); } else ret = __netif_receive_skb_core(skb, false); @@ -7264,12 +7265,10 @@ static int netif_alloc_rx_queues(struct net_device *dev) BUG_ON(count < 1); - rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!rx) { - rx = vzalloc(sz); - if (!rx) - return -ENOMEM; - } + rx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT); + if (!rx) + return -ENOMEM; + dev->_rx = rx; for (i = 0; i < count; i++) @@ -7306,12 +7305,10 @@ static int netif_alloc_netdev_queues(struct net_device *dev) if (count < 1 || count > 0xffff) return -EINVAL; - tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!tx) { - tx = vzalloc(sz); - if (!tx) - return -ENOMEM; - } + tx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT); + if (!tx) + return -ENOMEM; + dev->_tx = tx; netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); @@ -7845,9 +7842,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; - p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!p) - p = vzalloc(alloc_size); + p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_REPEAT); if (!p) return NULL; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 6e67315ec368..bcb0f610ee42 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1054,7 +1054,7 @@ static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) return err; } - if (nla_put(skb, IFLA_PHYS_PORT_NAME, strlen(name), name)) + if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name)) return -EMSGSIZE; return 0; diff --git a/net/core/sock.c b/net/core/sock.c index b5baeb9cb0fb..79c6aee6af9b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -102,6 +102,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> @@ -372,14 +373,14 @@ EXPORT_SYMBOL_GPL(sk_clear_memalloc); int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int ret; - unsigned long pflags = current->flags; + unsigned int noreclaim_flag; /* these should have been dropped before queueing */ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); ret = sk->sk_backlog_rcv(sk, skb); - current_restore_flags(pflags, PF_MEMALLOC); + memalloc_noreclaim_restore(noreclaim_flag); return ret; } diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 9afa2a5030b2..405483a07efc 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -2361,7 +2361,8 @@ MODULE_AUTHOR("Linux DECnet Project Team"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_DECnet); -static char banner[] __initdata = KERN_INFO "NET4: DECnet for Linux: V.2.5.68s (C) 1995-2003 Linux DECnet Project Team\n"; +static const char banner[] __initconst = KERN_INFO +"NET4: DECnet for Linux: V.2.5.68s (C) 1995-2003 Linux DECnet Project Team\n"; static int __init decnet_init(void) { diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 8bea74298173..e9a59d2d91d4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -678,11 +678,7 @@ int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) /* no more locks than number of hash buckets */ nblocks = min(nblocks, hashinfo->ehash_mask + 1); - hashinfo->ehash_locks = kmalloc_array(nblocks, locksz, - GFP_KERNEL | __GFP_NOWARN); - if (!hashinfo->ehash_locks) - hashinfo->ehash_locks = vmalloc(nblocks * locksz); - + hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); if (!hashinfo->ehash_locks) return -ENOMEM; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 9d943974de2b..bdffad875691 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -358,6 +358,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, rt->dst.dev->mtu); return -EMSGSIZE; } + if (length < sizeof(struct iphdr)) + return -EINVAL; + if (flags&MSG_PROBE) goto out; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 9d0d4f39e42b..653bbd67e3a3 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1011,10 +1011,7 @@ static int __net_init tcp_net_metrics_init(struct net *net) tcp_metrics_hash_log = order_base_2(slots); size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log; - tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!tcp_metrics_hash) - tcp_metrics_hash = vzalloc(size); - + tcp_metrics_hash = kvzalloc(size, GFP_KERNEL); if (!tcp_metrics_hash) return -ENOMEM; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8f6373b0cd77..717be4de5324 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -523,6 +523,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; tcp_ecn_openreq_child(newtp, req); + newtp->fastopen_req = NULL; newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; newtp->rack.mstamp.v64 = 0; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b09ac38d8dc4..77a4bd526d6e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3328,7 +3328,8 @@ static int fixup_permanent_addr(struct inet6_dev *idev, idev->dev, 0, 0); } - addrconf_dad_start(ifp); + if (ifp->state == INET6_IFADDR_STATE_PREDAD) + addrconf_dad_start(ifp); return 0; } @@ -3683,7 +3684,7 @@ restart: if (keep) { /* set state to skip the notifier below */ state = INET6_IFADDR_STATE_DEAD; - ifa->state = 0; + ifa->state = INET6_IFADDR_STATE_PREDAD; if (!(ifa->flags & IFA_F_NODAD)) ifa->flags |= IFA_F_TENTATIVE; @@ -6572,6 +6573,8 @@ int __init addrconf_init(void) goto errlo; } + ip6_route_init_special_entries(); + for (i = 0; i < IN6_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&inet6_addr_lst[i]); diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index af8f52ee7180..2fd5ca151dcf 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -41,13 +41,7 @@ static int alloc_ila_locks(struct ila_net *ilan) size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU); if (sizeof(spinlock_t) != 0) { -#ifdef CONFIG_NUMA - if (size * sizeof(spinlock_t) > PAGE_SIZE) - ilan->locks = vmalloc(size * sizeof(spinlock_t)); - else -#endif - ilan->locks = kmalloc_array(size, sizeof(spinlock_t), - GFP_KERNEL); + ilan->locks = kvmalloc(size * sizeof(spinlock_t), GFP_KERNEL); if (!ilan->locks) return -ENOMEM; for (i = 0; i < size; i++) diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index bf3ad3e7b647..b2b4f031b3a1 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -235,7 +235,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, inside->icmp6.icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len - hdrlen, IPPROTO_ICMPV6, - csum_partial(&inside->icmp6, + skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 0da6a12b5472..1f992d9e261d 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -632,6 +632,8 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu); return -EMSGSIZE; } + if (length < sizeof(struct ipv6hdr)) + return -EINVAL; if (flags&MSG_PROBE) goto out; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a1bf426c959b..2f1136627dcb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4027,6 +4027,21 @@ static struct notifier_block ip6_route_dev_notifier = { .priority = 0, }; +void __init ip6_route_init_special_entries(void) +{ + /* Registering of the loopback is done before this portion of code, + * the loopback reference in rt6_info will not be taken, do it + * manually for init_net */ + init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + #ifdef CONFIG_IPV6_MULTIPLE_TABLES + init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + #endif +} + int __init ip6_route_init(void) { int ret; @@ -4053,17 +4068,6 @@ int __init ip6_route_init(void) ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; - /* Registering of the loopback is done before this portion of code, - * the loopback reference in rt6_info will not be taken, do it - * manually for init_net */ - init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; - init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); - #ifdef CONFIG_IPV6_MULTIPLE_TABLES - init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; - init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); - init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; - init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); - #endif ret = fib6_init(); if (ret) goto out_register_subsys; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 088e2b459d0f..257ec66009da 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2005,10 +2005,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) unsigned index; if (size) { - labels = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); - if (!labels) - labels = vzalloc(size); - + labels = kvzalloc(size, GFP_KERNEL); if (!labels) goto nolabels; } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 668d9643f0cc..1fa3c2307b6e 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3078,6 +3078,17 @@ nla_put_failure: return skb->len; } +static bool ip_vs_is_af_valid(int af) +{ + if (af == AF_INET) + return true; +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6 && ipv6_mod_enabled()) + return true; +#endif + return false; +} + static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *usvc, struct nlattr *nla, int full_entry, @@ -3105,11 +3116,7 @@ static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, memset(usvc, 0, sizeof(*usvc)); usvc->af = nla_get_u16(nla_af); -#ifdef CONFIG_IP_VS_IPV6 - if (usvc->af != AF_INET && usvc->af != AF_INET6) -#else - if (usvc->af != AF_INET) -#endif + if (!ip_vs_is_af_valid(usvc->af)) return -EAFNOSUPPORT; if (nla_fwmark) { @@ -3612,6 +3619,11 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) if (udest.af == 0) udest.af = svc->af; + if (!ip_vs_is_af_valid(udest.af)) { + ret = -EAFNOSUPPORT; + goto out; + } + if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) { /* The synchronization protocol is incompatible * with mixed family services diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f9245dbfe435..3c8f1ed2f555 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1853,7 +1853,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); -static unsigned int total_extension_size(void) +static __always_inline unsigned int total_extension_size(void) { /* remember to add new extensions below */ BUILD_BUG_ON(NF_CT_EXT_NUM > 9); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 4b9dfe3eef62..3a60efa7799b 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -385,7 +385,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; unsigned int h = helper_hash(&me->tuple); struct nf_conntrack_helper *cur; - int ret = 0; + int ret = 0, i; BUG_ON(me->expect_policy == NULL); BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); @@ -395,10 +395,26 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) return -EINVAL; mutex_lock(&nf_ct_helper_mutex); - hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { - if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) { - ret = -EEXIST; - goto out; + for (i = 0; i < nf_ct_helper_hsize; i++) { + hlist_for_each_entry(cur, &nf_ct_helper_hash[i], hnode) { + if (!strcmp(cur->name, me->name) && + (cur->tuple.src.l3num == NFPROTO_UNSPEC || + cur->tuple.src.l3num == me->tuple.src.l3num) && + cur->tuple.dst.protonum == me->tuple.dst.protonum) { + ret = -EEXIST; + goto out; + } + } + } + + /* avoid unpredictable behaviour for auto_assign_helper */ + if (!(me->flags & NF_CT_HELPER_F_USERSPACE)) { + hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { + if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, + &mask)) { + ret = -EEXIST; + goto out; + } } } hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 5f6f2f388928..dcf561b5c97a 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -417,8 +417,7 @@ nla_put_failure: return -1; } -static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, - const struct nf_conn *ct) +static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, struct nf_conn *ct) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *seq; @@ -426,15 +425,20 @@ static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj) return 0; + spin_lock_bh(&ct->lock); seq = &seqadj->seq[IP_CT_DIR_ORIGINAL]; if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1) - return -1; + goto err; seq = &seqadj->seq[IP_CT_DIR_REPLY]; if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1) - return -1; + goto err; + spin_unlock_bh(&ct->lock); return 0; +err: + spin_unlock_bh(&ct->lock); + return -1; } static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) @@ -1417,6 +1421,24 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, } #endif +static void +__ctnetlink_change_status(struct nf_conn *ct, unsigned long on, + unsigned long off) +{ + unsigned int bit; + + /* Ignore these unchangable bits */ + on &= ~IPS_UNCHANGEABLE_MASK; + off &= ~IPS_UNCHANGEABLE_MASK; + + for (bit = 0; bit < __IPS_MAX_BIT; bit++) { + if (on & (1 << bit)) + set_bit(bit, &ct->status); + else if (off & (1 << bit)) + clear_bit(bit, &ct->status); + } +} + static int ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) { @@ -1436,10 +1458,7 @@ ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) /* ASSURED bit can only be set */ return -EBUSY; - /* Be careful here, modifying NAT bits can screw up things, - * so don't let users modify them directly if they don't pass - * nf_nat_range. */ - ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK); + __ctnetlink_change_status(ct, status, 0); return 0; } @@ -1508,23 +1527,11 @@ static int ctnetlink_change_helper(struct nf_conn *ct, return 0; } + rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), nf_ct_protonum(ct)); if (helper == NULL) { -#ifdef CONFIG_MODULES - spin_unlock_bh(&nf_conntrack_expect_lock); - - if (request_module("nfct-helper-%s", helpname) < 0) { - spin_lock_bh(&nf_conntrack_expect_lock); - return -EOPNOTSUPP; - } - - spin_lock_bh(&nf_conntrack_expect_lock); - helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), - nf_ct_protonum(ct)); - if (helper) - return -EAGAIN; -#endif + rcu_read_unlock(); return -EOPNOTSUPP; } @@ -1533,13 +1540,16 @@ static int ctnetlink_change_helper(struct nf_conn *ct, /* update private helper data if allowed. */ if (helper->from_nlattr) helper->from_nlattr(helpinfo, ct); - return 0; + err = 0; } else - return -EBUSY; + err = -EBUSY; + } else { + /* we cannot set a helper for an existing conntrack */ + err = -EOPNOTSUPP; } - /* we cannot set a helper for an existing conntrack */ - return -EOPNOTSUPP; + rcu_read_unlock(); + return err; } static int ctnetlink_change_timeout(struct nf_conn *ct, @@ -1630,25 +1640,30 @@ ctnetlink_change_seq_adj(struct nf_conn *ct, if (!seqadj) return 0; + spin_lock_bh(&ct->lock); if (cda[CTA_SEQ_ADJ_ORIG]) { ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL], cda[CTA_SEQ_ADJ_ORIG]); if (ret < 0) - return ret; + goto err; - ct->status |= IPS_SEQ_ADJUST; + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); } if (cda[CTA_SEQ_ADJ_REPLY]) { ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY], cda[CTA_SEQ_ADJ_REPLY]); if (ret < 0) - return ret; + goto err; - ct->status |= IPS_SEQ_ADJUST; + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); } + spin_unlock_bh(&ct->lock); return 0; +err: + spin_unlock_bh(&ct->lock); + return ret; } static int @@ -1959,9 +1974,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, err = -EEXIST; ct = nf_ct_tuplehash_to_ctrack(h); if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { - spin_lock_bh(&nf_conntrack_expect_lock); err = ctnetlink_change_conntrack(ct, cda); - spin_unlock_bh(&nf_conntrack_expect_lock); if (err == 0) { nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | @@ -2294,10 +2307,10 @@ ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[]) /* This check is less strict than ctnetlink_change_status() * because callers often flip IPS_EXPECTED bits when sending * an NFQA_CT attribute to the kernel. So ignore the - * unchangeable bits but do not error out. + * unchangeable bits but do not error out. Also user programs + * are allowed to clear the bits that they are allowed to change. */ - ct->status = (status & ~IPS_UNCHANGEABLE_MASK) | - (ct->status & IPS_UNCHANGEABLE_MASK); + __ctnetlink_change_status(ct, status, ~status); return 0; } @@ -2351,11 +2364,7 @@ ctnetlink_glue_parse(const struct nlattr *attr, struct nf_conn *ct) if (ret < 0) return ret; - spin_lock_bh(&nf_conntrack_expect_lock); - ret = ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct); - spin_unlock_bh(&nf_conntrack_expect_lock); - - return ret; + return ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct); } static int ctnetlink_glue_exp_parse(const struct nlattr * const *cda, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1c6482d2c4dc..559225029740 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3778,6 +3778,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = set->ops->insert(ctx->net, set, &elem, &ext2); if (err) { if (err == -EEXIST) { + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^ + nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) || + nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^ + nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) + return -EBUSY; if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && memcmp(nft_set_ext_data(ext), diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 3948da380259..66221ad891a9 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -82,8 +82,7 @@ static void nft_dynset_eval(const struct nft_expr *expr, nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { timeout = priv->timeout ? : set->timeout; *nft_set_ext_expiration(ext) = jiffies + timeout; - } else if (sexpr == NULL) - goto out; + } if (sexpr != NULL) sexpr->ops->eval(sexpr, regs, pkt); @@ -92,7 +91,7 @@ static void nft_dynset_eval(const struct nft_expr *expr, regs->verdict.code = NFT_BREAK; return; } -out: + if (!priv->invert) regs->verdict.code = NFT_BREAK; } diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 8ebbc2940f4c..b988162b5b15 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -257,6 +257,11 @@ static int nft_bitmap_init(const struct nft_set *set, static void nft_bitmap_destroy(const struct nft_set *set) { + struct nft_bitmap *priv = nft_set_priv(set); + struct nft_bitmap_elem *be, *n; + + list_for_each_entry_safe(be, n, &priv->list, head) + nft_set_elem_destroy(set, be, true); } static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 14857afc9937..8876b7da6884 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -763,17 +763,8 @@ EXPORT_SYMBOL(xt_check_entry_offsets); */ unsigned int *xt_alloc_entry_offsets(unsigned int size) { - unsigned int *off; + return kvmalloc_array(size, sizeof(unsigned int), GFP_KERNEL | __GFP_ZERO); - off = kcalloc(size, sizeof(unsigned int), GFP_KERNEL | __GFP_NOWARN); - - if (off) - return off; - - if (size < (SIZE_MAX / sizeof(unsigned int))) - off = vmalloc(size * sizeof(unsigned int)); - - return off; } EXPORT_SYMBOL(xt_alloc_entry_offsets); @@ -1007,8 +998,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); if (!info) { - info = __vmalloc(sz, GFP_KERNEL | __GFP_NOWARN | - __GFP_NORETRY | __GFP_HIGHMEM, + info = __vmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, PAGE_KERNEL); if (!info) return NULL; @@ -1051,8 +1041,10 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, list_for_each_entry(t, &init_net.xt.tables[af], list) { if (strcmp(t->name, name)) continue; - if (!try_module_get(t->me)) + if (!try_module_get(t->me)) { + mutex_unlock(&xt[af].mutex); return NULL; + } mutex_unlock(&xt[af].mutex); if (t->table_init(net) != 0) { @@ -1114,7 +1106,7 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) size = sizeof(void **) * nr_cpu_ids; if (size > PAGE_SIZE) - i->jumpstack = vzalloc(size); + i->jumpstack = kvzalloc(size, GFP_KERNEL); else i->jumpstack = kzalloc(size, GFP_KERNEL); if (i->jumpstack == NULL) @@ -1136,12 +1128,8 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) */ size = sizeof(void *) * i->stacksize * 2u; for_each_possible_cpu(cpu) { - if (size > PAGE_SIZE) - i->jumpstack[cpu] = vmalloc_node(size, - cpu_to_node(cpu)); - else - i->jumpstack[cpu] = kmalloc_node(size, - GFP_KERNEL, cpu_to_node(cpu)); + i->jumpstack[cpu] = kvmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); if (i->jumpstack[cpu] == NULL) /* * Freeing will be done later on by the callers. The diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 3cbe1bcf6a74..bb7ad82dcd56 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -168,8 +168,10 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, goto err_put_timeout; } timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC); - if (timeout_ext == NULL) + if (!timeout_ext) { ret = -ENOMEM; + goto err_put_timeout; + } rcu_read_unlock(); return ret; @@ -201,6 +203,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, struct xt_ct_target_info_v1 *info) { struct nf_conntrack_zone zone; + struct nf_conn_help *help; struct nf_conn *ct; int ret = -EOPNOTSUPP; @@ -249,7 +252,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, if (info->timeout[0]) { ret = xt_ct_set_timeout(ct, par, info->timeout); if (ret < 0) - goto err3; + goto err4; } __set_bit(IPS_CONFIRMED_BIT, &ct->status); nf_conntrack_get(&ct->ct_general); @@ -257,6 +260,10 @@ out: info->ct = ct; return 0; +err4: + help = nfct_help(ct); + if (help) + module_put(help->helper->me); err3: nf_ct_tmpl_free(ct); err2: diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 37d581a31cff..3f6c4fa78bdb 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -388,10 +388,7 @@ static int recent_mt_check(const struct xt_mtchk_param *par, } sz = sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size; - if (sz <= PAGE_SIZE) - t = kzalloc(sz, GFP_KERNEL); - else - t = vzalloc(sz); + t = kvzalloc(sz, GFP_KERNEL); if (t == NULL) { ret = -ENOMEM; goto out; diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 770bbec878f1..e75ef39669c5 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -152,7 +152,7 @@ static int socket_mt_enable_defrag(struct net *net, int family) switch (family) { case NFPROTO_IPV4: return nf_defrag_ipv4_enable(net); -#ifdef XT_SOCKET_HAVE_IPV6 +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) case NFPROTO_IPV6: return nf_defrag_ipv6_enable(net); #endif diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 42a95919df09..bf602e33c40a 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -516,10 +516,38 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, u16 proto, const struct sk_buff *skb) { struct nf_conntrack_tuple tuple; + struct nf_conntrack_expect *exp; if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple)) return NULL; - return __nf_ct_expect_find(net, zone, &tuple); + + exp = __nf_ct_expect_find(net, zone, &tuple); + if (exp) { + struct nf_conntrack_tuple_hash *h; + + /* Delete existing conntrack entry, if it clashes with the + * expectation. This can happen since conntrack ALGs do not + * check for clashes between (new) expectations and existing + * conntrack entries. nf_conntrack_in() will check the + * expectations only if a conntrack entry can not be found, + * which can lead to OVS finding the expectation (here) in the + * init direction, but which will not be removed by the + * nf_conntrack_in() call, if a matching conntrack entry is + * found instead. In this case all init direction packets + * would be reported as new related packets, while reply + * direction packets would be reported as un-related + * established packets. + */ + h = nf_conntrack_find_get(net, zone, &tuple); + if (h) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + nf_ct_delete(ct, 0, 0); + nf_conntrack_put(&ct->ct_general); + } + } + + return exp; } /* This replicates logic from nf_conntrack_core.c that is not exported. */ diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 2efb36c08f2a..dee469fed967 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -203,8 +203,7 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, *arg = (unsigned long) head; rcu_assign_pointer(tp->root, new); - if (head) - call_rcu(&head->rcu, mall_destroy_rcu); + call_rcu(&head->rcu, mall_destroy_rcu); return 0; err_replace_hw_filter: diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index d00f4c7c2f3a..b30a2c70bd48 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -376,10 +376,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) if (mask != q->tab_mask) { struct sk_buff **ntab; - ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), - GFP_KERNEL | __GFP_NOWARN); - if (!ntab) - ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *)); + ntab = kvmalloc_array((mask + 1), sizeof(struct sk_buff *), GFP_KERNEL | __GFP_ZERO); if (!ntab) return -ENOMEM; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index da4f67bda0ee..b488721a0059 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -624,16 +624,6 @@ static void fq_rehash(struct fq_sched_data *q, q->stat_gc_flows += fcnt; } -static void *fq_alloc_node(size_t sz, int node) -{ - void *ptr; - - ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node); - if (!ptr) - ptr = vmalloc_node(sz, node); - return ptr; -} - static void fq_free(void *addr) { kvfree(addr); @@ -650,7 +640,7 @@ static int fq_resize(struct Qdisc *sch, u32 log) return 0; /* If XPS was setup, we can allocate memory on right NUMA node */ - array = fq_alloc_node(sizeof(struct rb_root) << log, + array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_REPEAT, netdev_queue_numa_node_read(sch->dev_queue)); if (!array) return -ENOMEM; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 18bbb5476c83..9201abce928c 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -446,27 +446,13 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) return 0; } -static void *fq_codel_zalloc(size_t sz) -{ - void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); - - if (!ptr) - ptr = vzalloc(sz); - return ptr; -} - -static void fq_codel_free(void *addr) -{ - kvfree(addr); -} - static void fq_codel_destroy(struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); tcf_destroy_chain(&q->filter_list); - fq_codel_free(q->backlogs); - fq_codel_free(q->flows); + kvfree(q->backlogs); + kvfree(q->flows); } static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) @@ -493,13 +479,13 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) } if (!q->flows) { - q->flows = fq_codel_zalloc(q->flows_cnt * - sizeof(struct fq_codel_flow)); + q->flows = kvzalloc(q->flows_cnt * + sizeof(struct fq_codel_flow), GFP_KERNEL); if (!q->flows) return -ENOMEM; - q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32)); + q->backlogs = kvzalloc(q->flows_cnt * sizeof(u32), GFP_KERNEL); if (!q->backlogs) { - fq_codel_free(q->flows); + kvfree(q->flows); return -ENOMEM; } for (i = 0; i < q->flows_cnt; i++) { diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index c19d346e6c5a..51d3ba682af9 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -467,29 +467,14 @@ static void hhf_reset(struct Qdisc *sch) rtnl_kfree_skbs(skb, skb); } -static void *hhf_zalloc(size_t sz) -{ - void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); - - if (!ptr) - ptr = vzalloc(sz); - - return ptr; -} - -static void hhf_free(void *addr) -{ - kvfree(addr); -} - static void hhf_destroy(struct Qdisc *sch) { int i; struct hhf_sched_data *q = qdisc_priv(sch); for (i = 0; i < HHF_ARRAYS_CNT; i++) { - hhf_free(q->hhf_arrays[i]); - hhf_free(q->hhf_valid_bits[i]); + kvfree(q->hhf_arrays[i]); + kvfree(q->hhf_valid_bits[i]); } for (i = 0; i < HH_FLOWS_CNT; i++) { @@ -503,7 +488,7 @@ static void hhf_destroy(struct Qdisc *sch) kfree(flow); } } - hhf_free(q->hh_flows); + kvfree(q->hh_flows); } static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { @@ -609,8 +594,8 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt) if (!q->hh_flows) { /* Initialize heavy-hitter flow table. */ - q->hh_flows = hhf_zalloc(HH_FLOWS_CNT * - sizeof(struct list_head)); + q->hh_flows = kvzalloc(HH_FLOWS_CNT * + sizeof(struct list_head), GFP_KERNEL); if (!q->hh_flows) return -ENOMEM; for (i = 0; i < HH_FLOWS_CNT; i++) @@ -624,8 +609,8 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt) /* Initialize heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { - q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN * - sizeof(u32)); + q->hhf_arrays[i] = kvzalloc(HHF_ARRAYS_LEN * + sizeof(u32), GFP_KERNEL); if (!q->hhf_arrays[i]) { /* Note: hhf_destroy() will be called * by our caller. @@ -637,8 +622,8 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt) /* Initialize valid bits of heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { - q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN / - BITS_PER_BYTE); + q->hhf_valid_bits[i] = kvzalloc(HHF_ARRAYS_LEN / + BITS_PER_BYTE, GFP_KERNEL); if (!q->hhf_valid_bits[i]) { /* Note: hhf_destroy() will be called * by our caller. diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index f0ce4780f395..1b3dd6190e93 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -702,15 +702,11 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) spinlock_t *root_lock; struct disttable *d; int i; - size_t s; if (n > NETEM_DIST_MAX) return -EINVAL; - s = sizeof(struct disttable) + n * sizeof(s16); - d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN); - if (!d) - d = vmalloc(s); + d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL); if (!d) return -ENOMEM; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index b00e02c139de..332d94be6e1c 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -685,11 +685,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) static void *sfq_alloc(size_t sz) { - void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN); - - if (!ptr) - ptr = vmalloc(sz); - return ptr; + return kvmalloc(sz, GFP_KERNEL); } static void sfq_free(void *addr) diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 919981324171..9aed6fe1bf1a 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -106,7 +106,6 @@ __init int net_sysctl_init(void) ret = register_pernet_subsys(&sysctl_pernet_ops); if (ret) goto out1; - register_sysctl_root(&net_sysctl_root); out: return ret; out1: |