diff options
Diffstat (limited to 'net')
821 files changed, 52608 insertions, 14028 deletions
diff --git a/net/802/mrp.c b/net/802/mrp.c index 2cfdfbfbb2ed..bea6e43d45a0 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -523,7 +523,7 @@ int mrp_request_join(const struct net_device *dev, struct mrp_attr *attr; if (sizeof(struct mrp_skb_cb) + len > - FIELD_SIZEOF(struct sk_buff, cb)) + sizeof_field(struct sk_buff, cb)) return -ENOMEM; spin_lock_bh(&app->lock); @@ -548,7 +548,7 @@ void mrp_request_leave(const struct net_device *dev, struct mrp_attr *attr; if (sizeof(struct mrp_skb_cb) + len > - FIELD_SIZEOF(struct sk_buff, cb)) + sizeof_field(struct sk_buff, cb)) return; spin_lock_bh(&app->lock); @@ -692,7 +692,7 @@ static int mrp_pdu_parse_vecattr(struct mrp_applicant *app, * advance to the next event in its Vector. */ if (sizeof(struct mrp_skb_cb) + mrp_cb(skb)->mh->attrlen > - FIELD_SIZEOF(struct sk_buff, cb)) + sizeof_field(struct sk_buff, cb)) return -1; if (skb_copy_bits(skb, *offset, mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen) < 0) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 54728d2eda18..d4bcfd8f95bf 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -172,7 +172,6 @@ int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) if (err < 0) goto out_uninit_mvrp; - vlan->nest_level = dev_get_nest_level(real_dev) + 1; err = register_netdevice(dev); if (err < 0) goto out_uninit_mvrp; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index c46daf09a501..bb7ec1a3915d 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -126,6 +126,7 @@ int vlan_check_real_dev(struct net_device *real_dev, void vlan_setup(struct net_device *dev); int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack); void unregister_vlan_dev(struct net_device *dev, struct list_head *head); +void vlan_dev_uninit(struct net_device *dev); bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 93eadf179123..990b9fde28c6 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -489,36 +489,6 @@ static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); } -/* - * vlan network devices have devices nesting below it, and are a special - * "super class" of normal network devices; split their locks off into a - * separate class since they always nest. - */ -static struct lock_class_key vlan_netdev_xmit_lock_key; -static struct lock_class_key vlan_netdev_addr_lock_key; - -static void vlan_dev_set_lockdep_one(struct net_device *dev, - struct netdev_queue *txq, - void *_subclass) -{ - lockdep_set_class_and_subclass(&txq->_xmit_lock, - &vlan_netdev_xmit_lock_key, - *(int *)_subclass); -} - -static void vlan_dev_set_lockdep_class(struct net_device *dev, int subclass) -{ - lockdep_set_class_and_subclass(&dev->addr_list_lock, - &vlan_netdev_addr_lock_key, - subclass); - netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, &subclass); -} - -static int vlan_dev_get_lock_subclass(struct net_device *dev) -{ - return vlan_dev_priv(dev)->nest_level; -} - static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, .parse = eth_header_parse, @@ -609,8 +579,6 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); - vlan_dev_set_lockdep_class(dev, vlan_dev_get_lock_subclass(dev)); - vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) return -ENOMEM; @@ -618,7 +586,8 @@ static int vlan_dev_init(struct net_device *dev) return 0; } -static void vlan_dev_uninit(struct net_device *dev) +/* Note: this function might be called multiple times for the same device. */ +void vlan_dev_uninit(struct net_device *dev) { struct vlan_priority_tci_mapping *pm; struct vlan_dev_priv *vlan = vlan_dev_priv(dev); @@ -678,8 +647,8 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev, const struct ethtool_ops *ops = vlan->real_dev->ethtool_ops; struct phy_device *phydev = vlan->real_dev->phydev; - if (phydev && phydev->drv && phydev->drv->ts_info) { - return phydev->drv->ts_info(phydev, info); + if (phy_has_tsinfo(phydev)) { + return phy_ts_info(phydev, info); } else if (ops->get_ts_info) { return ops->get_ts_info(vlan->real_dev, info); } else { @@ -812,7 +781,6 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup, #endif .ndo_fix_features = vlan_dev_fix_features, - .ndo_get_lock_subclass = vlan_dev_get_lock_subclass, .ndo_get_iflink = vlan_dev_get_iflink, }; diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index c482a6fe9393..0db85aeb119b 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -108,11 +108,13 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], struct ifla_vlan_flags *flags; struct ifla_vlan_qos_mapping *m; struct nlattr *attr; - int rem; + int rem, err; if (data[IFLA_VLAN_FLAGS]) { flags = nla_data(data[IFLA_VLAN_FLAGS]); - vlan_dev_change_flags(dev, flags->flags, flags->mask); + err = vlan_dev_change_flags(dev, flags->flags, flags->mask); + if (err) + return err; } if (data[IFLA_VLAN_INGRESS_QOS]) { nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) { @@ -123,7 +125,9 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], if (data[IFLA_VLAN_EGRESS_QOS]) { nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) { m = nla_data(attr); - vlan_dev_set_egress_priority(dev, m->from, m->to); + err = vlan_dev_set_egress_priority(dev, m->from, m->to); + if (err) + return err; } } return 0; @@ -179,10 +183,11 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, return -EINVAL; err = vlan_changelink(dev, tb, data, extack); - if (err < 0) - return err; - - return register_vlan_dev(dev, extack); + if (!err) + err = register_vlan_dev(dev, extack); + if (err) + vlan_dev_uninit(dev); + return err; } static inline size_t vlan_qos_map_size(unsigned int n) diff --git a/net/9p/client.c b/net/9p/client.c index 9622f3e469f6..1d48afc7033c 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -281,6 +281,7 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size) p9pdu_reset(&req->tc); p9pdu_reset(&req->rc); + req->t_err = 0; req->status = REQ_STATUS_ALLOC; init_waitqueue_head(&req->wq); INIT_LIST_HEAD(&req->req_list); diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index bac8dad5dd69..b21c3c209815 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -685,9 +685,9 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) goto error; /* Create the Completion Queue */ - rdma->cq = ib_alloc_cq(rdma->cm_id->device, client, - opts.sq_depth + opts.rq_depth + 1, - 0, IB_POLL_SOFTIRQ); + rdma->cq = ib_alloc_cq_any(rdma->cm_id->device, client, + opts.sq_depth + opts.rq_depth + 1, + IB_POLL_SOFTIRQ); if (IS_ERR(rdma->cq)) goto error; diff --git a/net/Kconfig b/net/Kconfig index 57f51a279ad6..b0937a700f01 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -91,6 +91,7 @@ if INET source "net/ipv4/Kconfig" source "net/ipv6/Kconfig" source "net/netlabel/Kconfig" +source "net/mptcp/Kconfig" endif # if INET @@ -108,9 +109,10 @@ config NETWORK_PHY_TIMESTAMPING bool "Timestamping in PHY devices" select NET_PTP_CLASSIFY help - This allows timestamping of network packets by PHYs with - hardware timestamping capabilities. This option adds some - overhead in the transmit and receive paths. + This allows timestamping of network packets by PHYs (or + other MII bus snooping devices) with hardware timestamping + capabilities. This option adds some overhead in the transmit + and receive paths. If you are unsure how to answer this question, answer N. @@ -258,7 +260,7 @@ config XPS default y config HWBM - bool + bool config CGROUP_NET_PRIO bool "Network priority cgroup" @@ -309,12 +311,12 @@ config BPF_STREAM_PARSER select STREAM_PARSER select NET_SOCK_MSG ---help--- - Enabling this allows a stream parser to be used with - BPF_MAP_TYPE_SOCKMAP. + Enabling this allows a stream parser to be used with + BPF_MAP_TYPE_SOCKMAP. - BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets. - It can be used to enforce socket policy, implement socket redirects, - etc. + BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets. + It can be used to enforce socket policy, implement socket redirects, + etc. config NET_FLOW_LIMIT bool @@ -349,12 +351,12 @@ config NET_DROP_MONITOR tristate "Network packet drop alerting service" depends on INET && TRACEPOINTS ---help--- - This feature provides an alerting service to userspace in the - event that packets are discarded in the network stack. Alerts - are broadcast via netlink socket to any listening user space - process. If you don't need network drop alerts, or if you are ok - just checking the various proc files and other utilities for - drop statistics, say N here. + This feature provides an alerting service to userspace in the + event that packets are discarded in the network stack. Alerts + are broadcast via netlink socket to any listening user space + process. If you don't need network drop alerts, or if you are ok + just checking the various proc files and other utilities for + drop statistics, say N here. endmenu @@ -430,9 +432,10 @@ config NET_SOCK_MSG config NET_DEVLINK bool default n + imply NET_DROP_MONITOR config PAGE_POOL - bool + bool config FAILOVER tristate "Generic failover module" @@ -447,6 +450,14 @@ config FAILOVER migration of VMs with direct attached VFs by failing over to the paravirtual datapath when the VF is unplugged. +config ETHTOOL_NETLINK + bool "Netlink interface for ethtool" + default y + help + An alternative userspace interface for ethtool based on generic + netlink. It provides better extensibility and some new features, + e.g. notification messages. + endif # if NET # Used by archs to tell that they support BPF JIT compiler plus which flavour. diff --git a/net/Makefile b/net/Makefile index 449fc0b221f8..07ea48160874 100644 --- a/net/Makefile +++ b/net/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_NET) += $(tmp-y) # LLC has to be linked before the files in net/802/ obj-$(CONFIG_LLC) += llc/ -obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ bpf/ +obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ bpf/ ethtool/ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ @@ -87,3 +87,4 @@ endif obj-$(CONFIG_QRTR) += qrtr/ obj-$(CONFIG_NET_NCSI) += ncsi/ obj-$(CONFIG_XDP_SOCKETS) += xdp/ +obj-$(CONFIG_MPTCP) += mptcp/ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index a8cb6b2e20c1..b41375d4d295 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -953,8 +953,8 @@ static unsigned long atalk_sum_skb(const struct sk_buff *skb, int offset, if (copy > len) copy = len; vaddr = kmap_atomic(skb_frag_page(frag)); - sum = atalk_sum_partial(vaddr + frag->page_offset + - offset - start, copy, sum); + sum = atalk_sum_partial(vaddr + skb_frag_off(frag) + + offset - start, copy, sum); kunmap_atomic(vaddr); if (!(len -= copy)) @@ -1023,6 +1023,11 @@ static int atalk_create(struct net *net, struct socket *sock, int protocol, */ if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) goto out; + + rc = -EPERM; + if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) + goto out; + rc = -ENOMEM; sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto, kern); if (!sk) diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c index 39b94ca5f65d..aa1b57161f3b 100644 --- a/net/atm/atm_sysfs.c +++ b/net/atm/atm_sysfs.c @@ -33,23 +33,17 @@ static ssize_t show_atmaddress(struct device *cdev, unsigned long flags; struct atm_dev *adev = to_atm_dev(cdev); struct atm_dev_addr *aaddr; - int bin[] = { 1, 2, 10, 6, 1 }, *fmt = bin; - int i, j, count = 0; + int count = 0; spin_lock_irqsave(&adev->lock, flags); list_for_each_entry(aaddr, &adev->local, entry) { - for (i = 0, j = 0; i < ATM_ESA_LEN; ++i, ++j) { - if (j == *fmt) { - count += scnprintf(buf + count, - PAGE_SIZE - count, "."); - ++fmt; - j = 0; - } - count += scnprintf(buf + count, - PAGE_SIZE - count, "%02x", - aaddr->addr.sas_addr.prv[i]); - } - count += scnprintf(buf + count, PAGE_SIZE - count, "\n"); + count += scnprintf(buf + count, PAGE_SIZE - count, + "%1phN.%2phN.%10phN.%6phN.%1phN\n", + &aaddr->addr.sas_addr.prv[0], + &aaddr->addr.sas_addr.prv[1], + &aaddr->addr.sas_addr.prv[3], + &aaddr->addr.sas_addr.prv[13], + &aaddr->addr.sas_addr.prv[19]); } spin_unlock_irqrestore(&adev->lock, flags); diff --git a/net/atm/clip.c b/net/atm/clip.c index a7972da7235d..294cb9efe3d3 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -89,7 +89,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc) struct clip_vcc **walk; if (!entry) { - pr_crit("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc); + pr_err("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc); return; } netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */ @@ -109,10 +109,10 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc) error = neigh_update(entry->neigh, NULL, NUD_NONE, NEIGH_UPDATE_F_ADMIN, 0); if (error) - pr_crit("neigh_update failed with %d\n", error); + pr_err("neigh_update failed with %d\n", error); goto out; } - pr_crit("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc); + pr_err("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc); out: netif_tx_unlock_bh(entry->neigh->dev); } diff --git a/net/atm/common.c b/net/atm/common.c index b7528e77997c..0ce530af534d 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -668,7 +668,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) mask |= EPOLLHUP; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* writable? */ diff --git a/net/atm/lec.c b/net/atm/lec.c index 5a77c235a212..25fa3a7b72bd 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -194,7 +194,7 @@ lec_send(struct atm_vcc *vcc, struct sk_buff *skb) dev->stats.tx_bytes += skb->len; } -static void lec_tx_timeout(struct net_device *dev) +static void lec_tx_timeout(struct net_device *dev, unsigned int txqueue) { pr_info("%s\n", dev->name); netif_trans_update(dev); @@ -799,14 +799,9 @@ static const char *lec_arp_get_status_string(unsigned char status) static void lec_info(struct seq_file *seq, struct lec_arp_table *entry) { - int i; - - for (i = 0; i < ETH_ALEN; i++) - seq_printf(seq, "%2.2x", entry->mac_addr[i] & 0xff); - seq_printf(seq, " "); - for (i = 0; i < ATM_ESA_LEN; i++) - seq_printf(seq, "%2.2x", entry->atm_addr[i] & 0xff); - seq_printf(seq, " %s %4.4x", lec_arp_get_status_string(entry->status), + seq_printf(seq, "%pM ", entry->mac_addr); + seq_printf(seq, "%*phN ", ATM_ESA_LEN, entry->atm_addr); + seq_printf(seq, "%s %4.4x", lec_arp_get_status_string(entry->status), entry->flags & 0xffff); if (entry->vcc) seq_printf(seq, "%3d %3d ", entry->vcc->vpi, entry->vcc->vci); @@ -1354,7 +1349,7 @@ static void dump_arp_table(struct lec_priv *priv) { struct lec_arp_table *rulla; char buf[256]; - int i, j, offset; + int i, offset; pr_info("Dump %p:\n", priv); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { @@ -1362,14 +1357,10 @@ static void dump_arp_table(struct lec_priv *priv) &priv->lec_arp_tables[i], next) { offset = 0; offset += sprintf(buf, "%d: %p\n", i, rulla); - offset += sprintf(buf + offset, "Mac: %pM", + offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); - offset += sprintf(buf + offset, " Atm:"); - for (j = 0; j < ATM_ESA_LEN; j++) { - offset += sprintf(buf + offset, - "%2.2x ", - rulla->atm_addr[j] & 0xff); - } + offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, + rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, @@ -1392,12 +1383,9 @@ static void dump_arp_table(struct lec_priv *priv) pr_info("No forward\n"); hlist_for_each_entry(rulla, &priv->lec_no_forward, next) { offset = 0; - offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr); - offset += sprintf(buf + offset, " Atm:"); - for (j = 0; j < ATM_ESA_LEN; j++) { - offset += sprintf(buf + offset, "%2.2x ", - rulla->atm_addr[j] & 0xff); - } + offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); + offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, + rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, @@ -1417,12 +1405,9 @@ static void dump_arp_table(struct lec_priv *priv) pr_info("Empty ones\n"); hlist_for_each_entry(rulla, &priv->lec_arp_empty_ones, next) { offset = 0; - offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr); - offset += sprintf(buf + offset, " Atm:"); - for (j = 0; j < ATM_ESA_LEN; j++) { - offset += sprintf(buf + offset, "%2.2x ", - rulla->atm_addr[j] & 0xff); - } + offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); + offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, + rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, @@ -1442,12 +1427,9 @@ static void dump_arp_table(struct lec_priv *priv) pr_info("Multicast Forward VCCs\n"); hlist_for_each_entry(rulla, &priv->mcast_fwds, next) { offset = 0; - offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr); - offset += sprintf(buf + offset, " Atm:"); - for (j = 0; j < ATM_ESA_LEN; j++) { - offset += sprintf(buf + offset, "%2.2x ", - rulla->atm_addr[j] & 0xff); - } + offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); + offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, + rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, @@ -1973,17 +1955,8 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, * Vcc which we don't want to make default vcc, * attach it anyway. */ - pr_debug("LEC_ARP:Attaching data direct, not default: %2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n", - ioc_data->atm_addr[0], ioc_data->atm_addr[1], - ioc_data->atm_addr[2], ioc_data->atm_addr[3], - ioc_data->atm_addr[4], ioc_data->atm_addr[5], - ioc_data->atm_addr[6], ioc_data->atm_addr[7], - ioc_data->atm_addr[8], ioc_data->atm_addr[9], - ioc_data->atm_addr[10], ioc_data->atm_addr[11], - ioc_data->atm_addr[12], ioc_data->atm_addr[13], - ioc_data->atm_addr[14], ioc_data->atm_addr[15], - ioc_data->atm_addr[16], ioc_data->atm_addr[17], - ioc_data->atm_addr[18], ioc_data->atm_addr[19]); + pr_debug("LEC_ARP:Attaching data direct, not default: %*phN\n", + ATM_ESA_LEN, ioc_data->atm_addr); entry = make_entry(priv, bus_mac); if (entry == NULL) goto out; @@ -1999,17 +1972,8 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, dump_arp_table(priv); goto out; } - pr_debug("LEC_ARP:Attaching data direct, default: %2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n", - ioc_data->atm_addr[0], ioc_data->atm_addr[1], - ioc_data->atm_addr[2], ioc_data->atm_addr[3], - ioc_data->atm_addr[4], ioc_data->atm_addr[5], - ioc_data->atm_addr[6], ioc_data->atm_addr[7], - ioc_data->atm_addr[8], ioc_data->atm_addr[9], - ioc_data->atm_addr[10], ioc_data->atm_addr[11], - ioc_data->atm_addr[12], ioc_data->atm_addr[13], - ioc_data->atm_addr[14], ioc_data->atm_addr[15], - ioc_data->atm_addr[16], ioc_data->atm_addr[17], - ioc_data->atm_addr[18], ioc_data->atm_addr[19]); + pr_debug("LEC_ARP:Attaching data direct, default: %*phN\n", + ATM_ESA_LEN, ioc_data->atm_addr); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry(entry, &priv->lec_arp_tables[i], next) { diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c index 4bb418313720..3286f9d527d3 100644 --- a/net/atm/mpoa_caches.c +++ b/net/atm/mpoa_caches.c @@ -180,8 +180,7 @@ static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc) static void in_cache_put(in_cache_entry *entry) { if (refcount_dec_and_test(&entry->use)) { - memset(entry, 0, sizeof(in_cache_entry)); - kfree(entry); + kzfree(entry); } } @@ -416,8 +415,7 @@ static eg_cache_entry *eg_cache_get_by_src_ip(__be32 ipaddr, static void eg_cache_put(eg_cache_entry *entry) { if (refcount_dec_and_test(&entry->use)) { - memset(entry, 0, sizeof(eg_cache_entry)); - kfree(entry); + kzfree(entry); } } diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c index 46d6cd9a36ae..829db9eba0cb 100644 --- a/net/atm/mpoa_proc.c +++ b/net/atm/mpoa_proc.c @@ -53,15 +53,12 @@ static ssize_t proc_mpc_write(struct file *file, const char __user *buff, static int parse_qos(const char *buff); -/* - * Define allowed FILE OPERATIONS - */ -static const struct file_operations mpc_file_operations = { - .open = proc_mpc_open, - .read = seq_read, - .llseek = seq_lseek, - .write = proc_mpc_write, - .release = seq_release, +static const struct proc_ops mpc_proc_ops = { + .proc_open = proc_mpc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = proc_mpc_write, + .proc_release = seq_release, }; /* @@ -290,7 +287,7 @@ int mpc_proc_init(void) { struct proc_dir_entry *p; - p = proc_create(STAT_FILE_NAME, 0, atm_proc_root, &mpc_file_operations); + p = proc_create(STAT_FILE_NAME, 0, atm_proc_root, &mpc_proc_ops); if (!p) { pr_err("Unable to initialize /proc/atm/%s\n", STAT_FILE_NAME); return -ENOMEM; diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c index bd3da9af5ef6..45d8e1d5d033 100644 --- a/net/atm/pppoatm.c +++ b/net/atm/pppoatm.c @@ -216,9 +216,7 @@ static void pppoatm_push(struct atm_vcc *atmvcc, struct sk_buff *skb) pvcc->chan.mtu += LLC_LEN; break; } - pr_debug("Couldn't autodetect yet (skb: %02X %02X %02X %02X %02X %02X)\n", - skb->data[0], skb->data[1], skb->data[2], - skb->data[3], skb->data[4], skb->data[5]); + pr_debug("Couldn't autodetect yet (skb: %6ph)\n", skb->data); goto error; case e_vc: break; diff --git a/net/atm/proc.c b/net/atm/proc.c index d79221fd4dae..4369ffa3302a 100644 --- a/net/atm/proc.c +++ b/net/atm/proc.c @@ -36,9 +36,9 @@ static ssize_t proc_dev_atm_read(struct file *file, char __user *buf, size_t count, loff_t *pos); -static const struct file_operations proc_atm_dev_ops = { - .read = proc_dev_atm_read, - .llseek = noop_llseek, +static const struct proc_ops atm_dev_proc_ops = { + .proc_read = proc_dev_atm_read, + .proc_lseek = noop_llseek, }; static void add_stats(struct seq_file *seq, const char *aal, @@ -134,8 +134,7 @@ static void vcc_seq_stop(struct seq_file *seq, void *v) static void *vcc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { v = vcc_walk(seq, 1); - if (v) - (*pos)++; + (*pos)++; return v; } @@ -360,7 +359,7 @@ int atm_proc_dev_register(struct atm_dev *dev) goto err_out; dev->proc_entry = proc_create_data(dev->proc_name, 0, atm_proc_root, - &proc_atm_dev_ops, dev); + &atm_dev_proc_ops, dev); if (!dev->proc_entry) goto err_free_name; return 0; diff --git a/net/atm/signaling.c b/net/atm/signaling.c index 6c11cdf4dd4c..fbd0c5e7b299 100644 --- a/net/atm/signaling.c +++ b/net/atm/signaling.c @@ -109,7 +109,7 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb) dev_kfree_skb(skb); goto as_indicate_complete; } - sk->sk_ack_backlog++; + sk_acceptq_added(sk); skb_queue_tail(&sk->sk_receive_queue, skb); pr_debug("waking sk_sleep(sk) 0x%p\n", sk_sleep(sk)); sk->sk_state_change(sk); diff --git a/net/atm/svc.c b/net/atm/svc.c index 908cbb8654f5..ba144d035e3d 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -381,7 +381,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags, msg->pvc.sap_addr.vpi, msg->pvc.sap_addr.vci); dev_kfree_skb(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); if (error) { sigd_enq2(NULL, as_reject, old_vcc, NULL, NULL, &old_vcc->qos, error); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index ca5207767dc2..ff57ea89c27e 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -808,7 +808,7 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, struct sock *sk; ax25_cb *ax25; - if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + if (protocol < 0 || protocol > U8_MAX) return -EINVAL; if (!net_eq(net, &init_net)) @@ -855,6 +855,8 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, break; case SOCK_RAW: + if (!capable(CAP_NET_RAW)) + return -EPERM; break; default: return -ESOCKTNOSUPPORT; @@ -1382,7 +1384,7 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags, /* Now attach up the new socket */ kfree_skb(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; out: diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index dcdbaeeb2358..cd6afe895db9 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -356,7 +356,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, make->sk_state = TCP_ESTABLISHED; - sk->sk_ack_backlog++; + sk_acceptq_added(sk); bh_unlock_sock(sk); } else { if (!mine) diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index a3d188dfbe75..c762758a4649 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich @@ -12,11 +12,11 @@ config BATMAN_ADV depends on NET select LIBCRC32C help - B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is - a routing protocol for multi-hop ad-hoc mesh networks. The - networks may be wired or wireless. See - https://www.open-mesh.org/ for more information and user space - tools. + B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is + a routing protocol for multi-hop ad-hoc mesh networks. The + networks may be wired or wireless. See + https://www.open-mesh.org/ for more information and user space + tools. config BATMAN_ADV_BATMAN_V bool "B.A.T.M.A.N. V protocol" @@ -100,7 +100,6 @@ config BATMAN_ADV_DEBUG config BATMAN_ADV_SYSFS bool "batman-adv sysfs entries" depends on BATMAN_ADV - default y help Say Y here if you want to enable batman-adv device configuration and status interface through sysfs attributes. It is replaced by the diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index fd63e116d9ff..daa49af7ff40 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index fa39eaaab9d7..382fbe51fd34 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 37898da8ad48..686a60bc9492 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus LĂĽssing */ diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 240ed70912d6..f0209505e41a 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -22,6 +22,8 @@ #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> +#include <linux/lockdep.h> +#include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/pkt_sched.h> @@ -193,14 +195,18 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) unsigned char *ogm_buff; u32 random_seqno; + mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); + /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&hard_iface->bat_iv.ogm_seqno, random_seqno); hard_iface->bat_iv.ogm_buff_len = BATADV_OGM_HLEN; ogm_buff = kmalloc(hard_iface->bat_iv.ogm_buff_len, GFP_ATOMIC); - if (!ogm_buff) + if (!ogm_buff) { + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); return -ENOMEM; + } hard_iface->bat_iv.ogm_buff = ogm_buff; @@ -212,35 +218,59 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) batadv_ogm_packet->reserved = 0; batadv_ogm_packet->tq = BATADV_TQ_MAX_VALUE; + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); + return 0; } static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface) { + mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); + kfree(hard_iface->bat_iv.ogm_buff); hard_iface->bat_iv.ogm_buff = NULL; + + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } static void batadv_iv_ogm_iface_update_mac(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; - unsigned char *ogm_buff = hard_iface->bat_iv.ogm_buff; + void *ogm_buff; - batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff; + mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); + + ogm_buff = hard_iface->bat_iv.ogm_buff; + if (!ogm_buff) + goto unlock; + + batadv_ogm_packet = ogm_buff; ether_addr_copy(batadv_ogm_packet->orig, hard_iface->net_dev->dev_addr); ether_addr_copy(batadv_ogm_packet->prev_sender, hard_iface->net_dev->dev_addr); + +unlock: + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } static void batadv_iv_ogm_primary_iface_set(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; - unsigned char *ogm_buff = hard_iface->bat_iv.ogm_buff; + void *ogm_buff; - batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff; + mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); + + ogm_buff = hard_iface->bat_iv.ogm_buff; + if (!ogm_buff) + goto unlock; + + batadv_ogm_packet = ogm_buff; batadv_ogm_packet->ttl = BATADV_TTL; + +unlock: + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } /* when do we schedule our own ogm to be sent */ @@ -277,17 +307,23 @@ static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv) * batadv_iv_ogm_aggr_packet() - checks if there is another OGM attached * @buff_pos: current position in the skb * @packet_len: total length of the skb - * @tvlv_len: tvlv length of the previously considered OGM + * @ogm_packet: potential OGM in buffer * * Return: true if there is enough space for another OGM, false otherwise. */ -static bool batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, - __be16 tvlv_len) +static bool +batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, + const struct batadv_ogm_packet *ogm_packet) { int next_buff_pos = 0; - next_buff_pos += buff_pos + BATADV_OGM_HLEN; - next_buff_pos += ntohs(tvlv_len); + /* check if there is enough space for the header */ + next_buff_pos += buff_pos + sizeof(*ogm_packet); + if (next_buff_pos > packet_len) + return false; + + /* check if there is enough space for the optional TVLV */ + next_buff_pos += ntohs(ogm_packet->tvlv_len); return (next_buff_pos <= packet_len) && (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES); @@ -315,7 +351,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, /* adjust all flags and log packets */ while (batadv_iv_ogm_aggr_packet(buff_pos, forw_packet->packet_len, - batadv_ogm_packet->tvlv_len)) { + batadv_ogm_packet)) { /* we might have aggregated direct link packets with an * ordinary base packet */ @@ -736,7 +772,11 @@ batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) } } -static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) +/** + * batadv_iv_ogm_schedule_buff() - schedule submission of hardif ogm buffer + * @hard_iface: interface whose ogm buffer should be transmitted + */ +static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); unsigned char **ogm_buff = &hard_iface->bat_iv.ogm_buff; @@ -747,9 +787,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) u16 tvlv_len = 0; unsigned long send_time; - if (hard_iface->if_status == BATADV_IF_NOT_IN_USE || - hard_iface->if_status == BATADV_IF_TO_BE_REMOVED) - return; + lockdep_assert_held(&hard_iface->bat_iv.ogm_buff_mutex); /* the interface gets activated here to avoid race conditions between * the moment of activating the interface in @@ -817,6 +855,17 @@ out: batadv_hardif_put(primary_if); } +static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) +{ + if (hard_iface->if_status == BATADV_IF_NOT_IN_USE || + hard_iface->if_status == BATADV_IF_TO_BE_REMOVED) + return; + + mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); + batadv_iv_ogm_schedule_buff(hard_iface); + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); +} + /** * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over iterface * @orig_node: originator which reproadcasted the OGMs directly @@ -1704,7 +1753,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb, /* unpack the aggregated packets and process them one by one */ while (batadv_iv_ogm_aggr_packet(ogm_offset, skb_headlen(skb), - ogm_packet->tvlv_len)) { + ogm_packet)) { batadv_iv_ogm_process(skb, ogm_offset, if_incoming); ogm_offset += BATADV_OGM_HLEN; diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h index c7a9ba305bfc..0c57c1000c64 100644 --- a/net/batman-adv/bat_iv_ogm.h +++ b/net/batman-adv/bat_iv_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 22672cb3e25d..0ecaf1bb0068 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Linus LĂĽssing, Marek Lindner */ @@ -79,6 +79,7 @@ static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface) static void batadv_v_iface_disable(struct batadv_hard_iface *hard_iface) { + batadv_v_ogm_iface_disable(hard_iface); batadv_v_elp_iface_disable(hard_iface); } @@ -1081,6 +1082,11 @@ void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface) */ atomic_set(&hard_iface->bat_v.throughput_override, 0); atomic_set(&hard_iface->bat_v.elp_interval, 500); + + hard_iface->bat_v.aggr_len = 0; + skb_queue_head_init(&hard_iface->bat_v.aggr_list); + INIT_DELAYED_WORK(&hard_iface->bat_v.aggr_wq, + batadv_v_ogm_aggr_work); } /** diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h index 37833db098e6..5e0be10bc84e 100644 --- a/net/batman-adv/bat_v.h +++ b/net/batman-adv/bat_v.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus LĂĽssing */ diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 2614a9caee00..1e3172db7492 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Linus LĂĽssing, Marek Lindner */ @@ -107,10 +107,17 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) } if (ret) goto default_throughput; - if (!(sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT))) - goto default_throughput; - return sinfo.expected_throughput / 100; + if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) + return sinfo.expected_throughput / 100; + + /* try to estimate the expected throughput based on reported tx + * rates + */ + if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) + return cfg80211_calculate_bitrate(&sinfo.txrate) / 3; + + goto default_throughput; } /* if not a wifi interface, check if this device provides data via diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index 1a29505f4f66..4358d436be2a 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Linus LĂĽssing, Marek Lindner */ diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index fad95ef64e01..969466218999 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Antonio Quartulli */ @@ -17,12 +17,15 @@ #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> +#include <linux/lockdep.h> +#include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> +#include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> @@ -77,6 +80,20 @@ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, } /** + * batadv_v_ogm_start_queue_timer() - restart the OGM aggregation timer + * @hard_iface: the interface to use to send the OGM + */ +static void batadv_v_ogm_start_queue_timer(struct batadv_hard_iface *hard_iface) +{ + unsigned int msecs = BATADV_MAX_AGGREGATION_MS * 1000; + + /* msecs * [0.9, 1.1] */ + msecs += prandom_u32() % (msecs / 5) - (msecs / 10); + queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.aggr_wq, + msecs_to_jiffies(msecs / 1000)); +} + +/** * batadv_v_ogm_start_timer() - restart the OGM sending timer * @bat_priv: the bat priv with all the soft interface information */ @@ -116,14 +133,132 @@ static void batadv_v_ogm_send_to_if(struct sk_buff *skb, } /** - * batadv_v_ogm_send() - periodic worker broadcasting the own OGM - * @work: work queue item + * batadv_v_ogm_len() - OGMv2 packet length + * @skb: the OGM to check + * + * Return: Length of the given OGMv2 packet, including tvlv length, excluding + * ethernet header length. */ -static void batadv_v_ogm_send(struct work_struct *work) +static unsigned int batadv_v_ogm_len(struct sk_buff *skb) +{ + struct batadv_ogm2_packet *ogm_packet; + + ogm_packet = (struct batadv_ogm2_packet *)skb->data; + return BATADV_OGM2_HLEN + ntohs(ogm_packet->tvlv_len); +} + +/** + * batadv_v_ogm_queue_left() - check if given OGM still fits aggregation queue + * @skb: the OGM to check + * @hard_iface: the interface to use to send the OGM + * + * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. + * + * Return: True, if the given OGMv2 packet still fits, false otherwise. + */ +static bool batadv_v_ogm_queue_left(struct sk_buff *skb, + struct batadv_hard_iface *hard_iface) +{ + unsigned int max = min_t(unsigned int, hard_iface->net_dev->mtu, + BATADV_MAX_AGGREGATION_BYTES); + unsigned int ogm_len = batadv_v_ogm_len(skb); + + lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); + + return hard_iface->bat_v.aggr_len + ogm_len <= max; +} + +/** + * batadv_v_ogm_aggr_list_free - free all elements in an aggregation queue + * @hard_iface: the interface holding the aggregation queue + * + * Empties the OGMv2 aggregation queue and frees all the skbs it contained. + * + * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. + */ +static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface) +{ + lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); + + __skb_queue_purge(&hard_iface->bat_v.aggr_list); + hard_iface->bat_v.aggr_len = 0; +} + +/** + * batadv_v_ogm_aggr_send() - flush & send aggregation queue + * @hard_iface: the interface with the aggregation queue to flush + * + * Aggregates all OGMv2 packets currently in the aggregation queue into a + * single OGMv2 packet and transmits this aggregate. + * + * The aggregation queue is empty after this call. + * + * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. + */ +static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) +{ + unsigned int aggr_len = hard_iface->bat_v.aggr_len; + struct sk_buff *skb_aggr; + unsigned int ogm_len; + struct sk_buff *skb; + + lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); + + if (!aggr_len) + return; + + skb_aggr = dev_alloc_skb(aggr_len + ETH_HLEN + NET_IP_ALIGN); + if (!skb_aggr) { + batadv_v_ogm_aggr_list_free(hard_iface); + return; + } + + skb_reserve(skb_aggr, ETH_HLEN + NET_IP_ALIGN); + skb_reset_network_header(skb_aggr); + + while ((skb = __skb_dequeue(&hard_iface->bat_v.aggr_list))) { + hard_iface->bat_v.aggr_len -= batadv_v_ogm_len(skb); + + ogm_len = batadv_v_ogm_len(skb); + skb_put_data(skb_aggr, skb->data, ogm_len); + + consume_skb(skb); + } + + batadv_v_ogm_send_to_if(skb_aggr, hard_iface); +} + +/** + * batadv_v_ogm_queue_on_if() - queue a batman ogm on a given interface + * @skb: the OGM to queue + * @hard_iface: the interface to queue the OGM on + */ +static void batadv_v_ogm_queue_on_if(struct sk_buff *skb, + struct batadv_hard_iface *hard_iface) +{ + struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + + if (!atomic_read(&bat_priv->aggregated_ogms)) { + batadv_v_ogm_send_to_if(skb, hard_iface); + return; + } + + spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); + if (!batadv_v_ogm_queue_left(skb, hard_iface)) + batadv_v_ogm_aggr_send(hard_iface); + + hard_iface->bat_v.aggr_len += batadv_v_ogm_len(skb); + __skb_queue_tail(&hard_iface->bat_v.aggr_list, skb); + spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); +} + +/** + * batadv_v_ogm_send_softif() - periodic worker broadcasting the own OGM + * @bat_priv: the bat priv with all the soft interface information + */ +static void batadv_v_ogm_send_softif(struct batadv_priv *bat_priv) { struct batadv_hard_iface *hard_iface; - struct batadv_priv_bat_v *bat_v; - struct batadv_priv *bat_priv; struct batadv_ogm2_packet *ogm_packet; struct sk_buff *skb, *skb_tmp; unsigned char *ogm_buff; @@ -131,8 +266,7 @@ static void batadv_v_ogm_send(struct work_struct *work) u16 tvlv_len = 0; int ret; - bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work); - bat_priv = container_of(bat_v, struct batadv_priv, bat_v); + lockdep_assert_held(&bat_priv->bat_v.ogm_buff_mutex); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) goto out; @@ -210,7 +344,7 @@ static void batadv_v_ogm_send(struct work_struct *work) break; } - batadv_v_ogm_send_to_if(skb_tmp, hard_iface); + batadv_v_ogm_queue_on_if(skb_tmp, hard_iface); batadv_hardif_put(hard_iface); } rcu_read_unlock(); @@ -224,6 +358,44 @@ out: } /** + * batadv_v_ogm_send() - periodic worker broadcasting the own OGM + * @work: work queue item + */ +static void batadv_v_ogm_send(struct work_struct *work) +{ + struct batadv_priv_bat_v *bat_v; + struct batadv_priv *bat_priv; + + bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work); + bat_priv = container_of(bat_v, struct batadv_priv, bat_v); + + mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); + batadv_v_ogm_send_softif(bat_priv); + mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); +} + +/** + * batadv_v_ogm_aggr_work() - OGM queue periodic task per interface + * @work: work queue item + * + * Emits aggregated OGM message in regular intervals. + */ +void batadv_v_ogm_aggr_work(struct work_struct *work) +{ + struct batadv_hard_iface_bat_v *batv; + struct batadv_hard_iface *hard_iface; + + batv = container_of(work, struct batadv_hard_iface_bat_v, aggr_wq.work); + hard_iface = container_of(batv, struct batadv_hard_iface, bat_v); + + spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); + batadv_v_ogm_aggr_send(hard_iface); + spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); + + batadv_v_ogm_start_queue_timer(hard_iface); +} + +/** * batadv_v_ogm_iface_enable() - prepare an interface for B.A.T.M.A.N. V * @hard_iface: the interface to prepare * @@ -235,12 +407,26 @@ int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + batadv_v_ogm_start_queue_timer(hard_iface); batadv_v_ogm_start_timer(bat_priv); return 0; } /** + * batadv_v_ogm_iface_disable() - release OGM interface private resources + * @hard_iface: interface for which the resources have to be released + */ +void batadv_v_ogm_iface_disable(struct batadv_hard_iface *hard_iface) +{ + cancel_delayed_work_sync(&hard_iface->bat_v.aggr_wq); + + spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); + batadv_v_ogm_aggr_list_free(hard_iface); + spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); +} + +/** * batadv_v_ogm_primary_iface_set() - set a new primary interface * @primary_iface: the new primary interface */ @@ -249,11 +435,15 @@ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface) struct batadv_priv *bat_priv = netdev_priv(primary_iface->soft_iface); struct batadv_ogm2_packet *ogm_packet; + mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); if (!bat_priv->bat_v.ogm_buff) - return; + goto unlock; ogm_packet = (struct batadv_ogm2_packet *)bat_priv->bat_v.ogm_buff; ether_addr_copy(ogm_packet->orig, primary_iface->net_dev->dev_addr); + +unlock: + mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); } /** @@ -382,7 +572,7 @@ static void batadv_v_ogm_forward(struct batadv_priv *bat_priv, if_outgoing->net_dev->name, ntohl(ogm_forward->throughput), ogm_forward->ttl, if_incoming->net_dev->name); - batadv_v_ogm_send_to_if(skb, if_outgoing); + batadv_v_ogm_queue_on_if(skb, if_outgoing); out: if (orig_ifinfo) @@ -631,17 +821,23 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv, * batadv_v_ogm_aggr_packet() - checks if there is another OGM aggregated * @buff_pos: current position in the skb * @packet_len: total length of the skb - * @tvlv_len: tvlv length of the previously considered OGM + * @ogm2_packet: potential OGM2 in buffer * * Return: true if there is enough space for another OGM, false otherwise. */ -static bool batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, - __be16 tvlv_len) +static bool +batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, + const struct batadv_ogm2_packet *ogm2_packet) { int next_buff_pos = 0; - next_buff_pos += buff_pos + BATADV_OGM2_HLEN; - next_buff_pos += ntohs(tvlv_len); + /* check if there is enough space for the header */ + next_buff_pos += buff_pos + sizeof(*ogm2_packet); + if (next_buff_pos > packet_len) + return false; + + /* check if there is enough space for the optional TVLV */ + next_buff_pos += ntohs(ogm2_packet->tvlv_len); return (next_buff_pos <= packet_len) && (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES); @@ -818,7 +1014,7 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb, ogm_packet = (struct batadv_ogm2_packet *)skb->data; while (batadv_v_ogm_aggr_packet(ogm_offset, skb_headlen(skb), - ogm_packet->tvlv_len)) { + ogm_packet)) { batadv_v_ogm_process(skb, ogm_offset, if_incoming); ogm_offset += BATADV_OGM2_HLEN; @@ -869,6 +1065,8 @@ int batadv_v_ogm_init(struct batadv_priv *bat_priv) atomic_set(&bat_priv->bat_v.ogm_seqno, random_seqno); INIT_DELAYED_WORK(&bat_priv->bat_v.ogm_wq, batadv_v_ogm_send); + mutex_init(&bat_priv->bat_v.ogm_buff_mutex); + return 0; } @@ -880,7 +1078,11 @@ void batadv_v_ogm_free(struct batadv_priv *bat_priv) { cancel_delayed_work_sync(&bat_priv->bat_v.ogm_wq); + mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); + kfree(bat_priv->bat_v.ogm_buff); bat_priv->bat_v.ogm_buff = NULL; bat_priv->bat_v.ogm_buff_len = 0; + + mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); } diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index 2a50df7fc2bf..0ae2575f70bb 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Antonio Quartulli */ @@ -11,10 +11,13 @@ #include <linux/skbuff.h> #include <linux/types.h> +#include <linux/workqueue.h> int batadv_v_ogm_init(struct batadv_priv *bat_priv); void batadv_v_ogm_free(struct batadv_priv *bat_priv); +void batadv_v_ogm_aggr_work(struct work_struct *work); int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface); +void batadv_v_ogm_iface_disable(struct batadv_hard_iface *hard_iface); struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr); void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface); diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 7f04a6acf14e..4bc695cda397 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index 84ad2d2b6ac9..533c6d44cb58 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 663a53b6d36e..41cc87f06b14 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich */ @@ -844,7 +844,7 @@ static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, /* handle as ANNOUNCE frame */ backbone_gw->lasttime = jiffies; - crc = ntohs(*((__be16 *)(&an_addr[4]))); + crc = ntohs(*((__force __be16 *)(&an_addr[4]))); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): ANNOUNCE vid %d (sent by %pM)... CRC = %#.4x\n", diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 02b24a861a85..41edb2c4a327 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich */ diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 38c4d8e51155..452856c27d20 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 1c5afd301ce9..7e2e8f586f42 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index b0af3a11d406..3d21dd83f8cc 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Antonio Quartulli */ @@ -246,7 +246,7 @@ static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size) */ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size) { - return *(__be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN); + return *(__force __be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN); } /** @@ -270,7 +270,9 @@ static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size) */ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size) { - return *(__be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN * 2 + 4); + u8 *dst = batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN * 2 + 4; + + return *(__force __be32 *)dst; } /** @@ -285,16 +287,18 @@ static u32 batadv_hash_dat(const void *data, u32 size) u32 hash = 0; const struct batadv_dat_entry *dat = data; const unsigned char *key; + __be16 vid; u32 i; - key = (const unsigned char *)&dat->ip; + key = (__force const unsigned char *)&dat->ip; for (i = 0; i < sizeof(dat->ip); i++) { hash += key[i]; hash += (hash << 10); hash ^= (hash >> 6); } - key = (const unsigned char *)&dat->vid; + vid = htons(dat->vid); + key = (__force const unsigned char *)&vid; for (i = 0; i < sizeof(dat->vid); i++) { hash += key[i]; hash += (hash << 10); diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 67c7729add55..2bff2f4a325c 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Antonio Quartulli */ diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 385fccdcf69d..7cad97644d05 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> */ diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index abfe8c6556de..881ef328b6cd 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> */ diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 47df4c678988..e22e49289677 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 0be8e7178ec7..88b5dba84354 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index fc55750542e4..16cd9450ceb1 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index 211b14b37db8..c3a0c5a7f7e9 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index c90e47342bb0..c7e98a40dd33 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -18,6 +18,7 @@ #include <linux/kref.h> #include <linux/limits.h> #include <linux/list.h> +#include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> @@ -929,6 +930,7 @@ batadv_hardif_add_interface(struct net_device *net_dev) INIT_LIST_HEAD(&hard_iface->list); INIT_HLIST_HEAD(&hard_iface->neigh_list); + mutex_init(&hard_iface->bat_iv.ogm_buff_mutex); spin_lock_init(&hard_iface->neigh_list_lock); kref_init(&hard_iface->refcount); diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index bbb8a6f18d6b..bad2e50135e8 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index a9d4e176f4de..68638e0450a6 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 57877f0b78e0..91ae9f32b580 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 0a70b66e8770..ccb535c77e5d 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 27fafff586df..6abd0f4742ef 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index 11941cf1adcc..a67b2b091447 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 741cfa3719ff..f9884dc56cf3 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -74,7 +74,7 @@ __printf(2, 3); * @bat_priv: the bat priv with all the soft interface information * @ratelimited: whether output should be rate limited * @fmt: format string - * @arg...: variable arguments + * @arg: variable arguments */ #define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \ do { \ @@ -98,7 +98,7 @@ static inline void _batadv_dbg(int type __always_unused, * batadv_dbg() - Store debug output without ratelimiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information - * @arg...: format string and variable arguments + * @arg: format string and variable arguments */ #define batadv_dbg(type, bat_priv, arg...) \ _batadv_dbg(type, bat_priv, 0, ## arg) @@ -107,7 +107,7 @@ static inline void _batadv_dbg(int type __always_unused, * batadv_dbg_ratelimited() - Store debug output with ratelimiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information - * @arg...: format string and variable arguments + * @arg: format string and variable arguments */ #define batadv_dbg_ratelimited(type, bat_priv, arg...) \ _batadv_dbg(type, bat_priv, 1, ## arg) @@ -116,7 +116,7 @@ static inline void _batadv_dbg(int type __always_unused, * batadv_info() - Store message in debug buffer and print it to kmsg buffer * @net_dev: the soft interface net device * @fmt: format string - * @arg...: variable arguments + * @arg: variable arguments */ #define batadv_info(net_dev, fmt, arg...) \ do { \ @@ -130,7 +130,7 @@ static inline void _batadv_dbg(int type __always_unused, * batadv_err() - Store error in debug buffer and print it to kmsg buffer * @net_dev: the soft interface net device * @fmt: format string - * @arg...: variable arguments + * @arg: variable arguments */ #define batadv_err(net_dev, fmt, arg...) \ do { \ diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 4a89177def64..d8a255c85e77 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -548,7 +548,7 @@ static void batadv_recv_handler_init(void) BUILD_BUG_ON(sizeof(struct batadv_tvlv_tt_change) != 12); BUILD_BUG_ON(sizeof(struct batadv_tvlv_roam_adv) != 8); - i = FIELD_SIZEOF(struct sk_buff, cb); + i = sizeof_field(struct sk_buff, cb); BUILD_BUG_ON(sizeof(struct batadv_skb_cb) > i); /* broadcast packet */ diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 3d4c04d87ff3..692306df7b6f 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2019.3" +#define BATADV_SOURCE_VERSION "2020.0" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 1d5bdf3a4b65..9ebdc1e864b9 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2014-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2020 B.A.T.M.A.N. contributors: * * Linus LĂĽssing */ @@ -1421,7 +1421,7 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, if (*orig) return BATADV_FORW_SINGLE; - /* fall through */ + fallthrough; case 0: return BATADV_FORW_NONE; default: diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 5d9e2bb29c97..ebf825991ecd 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2014-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2020 B.A.T.M.A.N. contributors: * * Linus LĂĽssing */ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 6f08fd122a8d..02ed073f95a9 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2020 B.A.T.M.A.N. contributors: * * Matthias Schiffer */ @@ -164,7 +164,7 @@ batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype) { struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype); - return attr ? nla_get_u32(attr) : 0; + return (attr && nla_len(attr) == sizeof(u32)) ? nla_get_u32(attr) : 0; } /** diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index ddc674e47dbb..7ee48f916997 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2020 B.A.T.M.A.N. contributors: * * Matthias Schiffer */ diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 580609389f0f..8f0717c3f7b5 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen */ diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index 753fa49723cf..334289084127 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 38613487fb1b..5b0c2fffc214 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 512a1f99dd75..7bc01c138b3a 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index f0f864820dea..3632bd976c56 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index c20feac95107..2ed49db6eff5 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 3ce5f7bad369..7f8ade04e08e 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 5fc0fd1e5d08..0d36e15589f6 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index c7a2e77ca1da..5f05a728f347 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -22,7 +22,6 @@ #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> -#include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/percpu.h> @@ -230,7 +229,7 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, break; } - /* fall through */ + fallthrough; case ETH_P_BATMAN: goto dropped; } @@ -436,7 +435,7 @@ void batadv_interface_rx(struct net_device *soft_iface, /* clean the netfilter state now that the batman-adv header has been * removed */ - nf_reset(skb); + nf_reset_ct(skb); if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) goto dropped; @@ -455,7 +454,7 @@ void batadv_interface_rx(struct net_device *soft_iface, if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN)) break; - /* fall through */ + fallthrough; case ETH_P_BATMAN: goto dropped; } @@ -740,36 +739,6 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, return 0; } -/* batman-adv network devices have devices nesting below it and are a special - * "super class" of normal network devices; split their locks off into a - * separate class since they always nest. - */ -static struct lock_class_key batadv_netdev_xmit_lock_key; -static struct lock_class_key batadv_netdev_addr_lock_key; - -/** - * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue - * @dev: device which owns the tx queue - * @txq: tx queue to modify - * @_unused: always NULL - */ -static void batadv_set_lockdep_class_one(struct net_device *dev, - struct netdev_queue *txq, - void *_unused) -{ - lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key); -} - -/** - * batadv_set_lockdep_class() - Set txq and addr_list lockdep class - * @dev: network device to modify - */ -static void batadv_set_lockdep_class(struct net_device *dev) -{ - lockdep_set_class(&dev->addr_list_lock, &batadv_netdev_addr_lock_key); - netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL); -} - /** * batadv_softif_init_late() - late stage initialization of soft interface * @dev: registered network device to modify @@ -783,8 +752,6 @@ static int batadv_softif_init_late(struct net_device *dev) int ret; size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM; - batadv_set_lockdep_class(dev); - bat_priv = netdev_priv(dev); bat_priv->soft_iface = dev; @@ -943,10 +910,10 @@ static const struct net_device_ops batadv_netdev_ops = { static void batadv_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { - strlcpy(info->driver, "B.A.T.M.A.N. advanced", sizeof(info->driver)); - strlcpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version)); - strlcpy(info->fw_version, "N/A", sizeof(info->fw_version)); - strlcpy(info->bus_info, "batman", sizeof(info->bus_info)); + strscpy(info->driver, "B.A.T.M.A.N. advanced", sizeof(info->driver)); + strscpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version)); + strscpy(info->fw_version, "N/A", sizeof(info->fw_version)); + strscpy(info->bus_info, "batman", sizeof(info->bus_info)); } /* Inspired by drivers/net/ethernet/dlink/sundance.c:1702 diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 29139ad769fe..534e08d6ad91 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 1efcb97039cd..c45962d8527b 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ @@ -1070,7 +1070,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj, dev_hold(net_dev); INIT_WORK(&store_work->work, batadv_store_mesh_iface_work); store_work->net_dev = net_dev; - strlcpy(store_work->soft_iface_name, buff, + strscpy(store_work->soft_iface_name, buff, sizeof(store_work->soft_iface_name)); queue_work(batadv_event_workqueue, &store_work->work); diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index 5e466093dfa5..d987f8b30a98 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index dd6a9a40dbb9..bd2ac570c42c 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli */ diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index 78d310da0ad3..140105215aa2 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli */ diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c index 3cedd2c36528..3444d9e4e90d 100644 --- a/net/batman-adv/trace.c +++ b/net/batman-adv/trace.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Sven Eckelmann */ diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index d8f764521c0b..f631b1e01b89 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Sven Eckelmann */ diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 8a482c5ec67b..852932838ddc 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 4a98860d7f0e..b24d35b9226a 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index aae63f0d21eb..0963a43ad996 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index 36985000a0a8..d509d00c7a23 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 6ae139d74e0f..4a17a66cc572 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -17,6 +17,7 @@ #include <linux/if.h> #include <linux/if_ether.h> #include <linux/kref.h> +#include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/sched.h> /* for linux/wait.h */ @@ -81,6 +82,9 @@ struct batadv_hard_iface_bat_iv { /** @ogm_seqno: OGM sequence number - used to identify each OGM */ atomic_t ogm_seqno; + + /** @ogm_buff_mutex: lock protecting ogm_buff and ogm_buff_len */ + struct mutex ogm_buff_mutex; }; /** @@ -117,6 +121,15 @@ struct batadv_hard_iface_bat_v { /** @elp_wq: workqueue used to schedule ELP transmissions */ struct delayed_work elp_wq; + /** @aggr_wq: workqueue used to transmit queued OGM packets */ + struct delayed_work aggr_wq; + + /** @aggr_list: queue for to be aggregated OGM packets */ + struct sk_buff_head aggr_list; + + /** @aggr_len: size of the OGM aggregate (excluding ethernet header) */ + unsigned int aggr_len; + /** * @throughput_override: throughput override to disable link * auto-detection @@ -444,7 +457,7 @@ struct batadv_orig_node { /** * @tt_lock: prevents from updating the table while reading it. Table * update is made up by two operations (data structure update and - * metdata -CRC/TTVN-recalculation) and they have to be executed + * metadata -CRC/TTVN-recalculation) and they have to be executed * atomically in order to avoid another thread to read the * table/metadata between those. */ @@ -998,7 +1011,7 @@ struct batadv_priv_tt { /** * @commit_lock: prevents from executing a local TT commit while reading * the local table. The local TT commit is made up by two operations - * (data structure update and metdata -CRC/TTVN- recalculation) and + * (data structure update and metadata -CRC/TTVN- recalculation) and * they have to be executed atomically in order to avoid another thread * to read the table/metadata between those. */ @@ -1527,6 +1540,9 @@ struct batadv_priv_bat_v { /** @ogm_seqno: OGM sequence number - used to identify each OGM */ atomic_t ogm_seqno; + /** @ogm_buff_mutex: lock protecting ogm_buff and ogm_buff_len */ + struct mutex ogm_buff_mutex; + /** @ogm_wq: workqueue used to schedule OGM transmissions */ struct delayed_work ogm_wq; }; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 9d41de1ec90f..4febc82a7c76 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -571,19 +571,11 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) return err < 0 ? NET_XMIT_DROP : err; } -static int bt_dev_init(struct net_device *dev) -{ - netdev_lockdep_set_classes(dev); - - return 0; -} - static const struct net_device_ops netdev_ops = { - .ndo_init = bt_dev_init, .ndo_start_xmit = bt_xmit, }; -static struct header_ops header_ops = { +static const struct header_ops header_ops = { .create = header_create, }; diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 2efac049ad4c..165148c7c4ce 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -9,8 +9,9 @@ menuconfig BT depends on RFKILL || !RFKILL select CRC16 select CRYPTO - select CRYPTO_BLKCIPHER - select CRYPTO_AES + select CRYPTO_SKCIPHER + select CRYPTO_LIB_AES + imply CRYPTO_AES select CRYPTO_CMAC select CRYPTO_ECB select CRYPTO_SHA256 diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 94ddf19998c7..3fd124927d4d 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -173,7 +173,7 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) else release_sock(sk); - parent->sk_ack_backlog++; + sk_acceptq_added(parent); } EXPORT_SYMBOL(bt_accept_enqueue); @@ -185,7 +185,7 @@ void bt_accept_unlink(struct sock *sk) BT_DBG("sk %p state %d", sk, sk->sk_state); list_del_init(&bt_sk(sk)->accept_q); - bt_sk(sk)->parent->sk_ack_backlog--; + sk_acceptq_removed(bt_sk(sk)->parent); bt_sk(sk)->parent = NULL; sock_put(sk); } @@ -460,7 +460,7 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock, if (sk->sk_state == BT_LISTEN) return bt_accept_poll(sk); - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); @@ -470,7 +470,7 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock, if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk->sk_state == BT_CLOSED) diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c index 1d4d7d415730..cc1cff63194f 100644 --- a/net/bluetooth/bnep/netdev.c +++ b/net/bluetooth/bnep/netdev.c @@ -112,7 +112,7 @@ static int bnep_net_set_mac_addr(struct net_device *dev, void *arg) return 0; } -static void bnep_net_timeout(struct net_device *dev) +static void bnep_net_timeout(struct net_device *dev, unsigned int txqueue) { BT_DBG("net_timeout"); netif_wake_queue(dev); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index ad5b0ac1f9ce..87691404d0c6 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -934,6 +934,14 @@ static void hci_req_directed_advertising(struct hci_request *req, return; memset(&cp, 0, sizeof(cp)); + + /* Some controllers might reject command if intervals are not + * within range for undirected advertising. + * BCM20702A0 is known to be affected by this. + */ + cp.min_interval = cpu_to_le16(0x0020); + cp.max_interval = cpu_to_le16(0x0020); + cp.type = LE_ADV_DIRECT_IND; cp.own_address_type = own_addr_type; cp.direct_addr_type = conn->dst_type; @@ -1168,8 +1176,10 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, if (!conn) return ERR_PTR(-ENOMEM); - if (hci_explicit_conn_params_set(hdev, dst, dst_type) < 0) + if (hci_explicit_conn_params_set(hdev, dst, dst_type) < 0) { + hci_conn_del(conn); return ERR_PTR(-EBUSY); + } conn->state = BT_CONNECT; set_bit(HCI_CONN_SCANNING, &conn->flags); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 04bc79359a17..cbbc34a006d1 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -842,8 +842,8 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) { struct hci_cp_le_write_def_data_len cp; - cp.tx_len = hdev->le_max_tx_len; - cp.tx_time = hdev->le_max_tx_time; + cp.tx_len = cpu_to_le16(hdev->le_max_tx_len); + cp.tx_time = cpu_to_le16(hdev->le_max_tx_time); hci_req_add(req, HCI_OP_LE_WRITE_DEF_DATA_LEN, sizeof(cp), &cp); } @@ -1444,11 +1444,20 @@ static int hci_dev_do_open(struct hci_dev *hdev) if (hci_dev_test_flag(hdev, HCI_SETUP) || test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks)) { + bool invalid_bdaddr; + hci_sock_dev_event(hdev, HCI_DEV_SETUP); if (hdev->setup) ret = hdev->setup(hdev); + /* The transport driver can set the quirk to mark the + * BD_ADDR invalid before creating the HCI device or in + * its setup callback. + */ + invalid_bdaddr = test_bit(HCI_QUIRK_INVALID_BDADDR, + &hdev->quirks); + if (ret) goto setup_failed; @@ -1457,20 +1466,33 @@ static int hci_dev_do_open(struct hci_dev *hdev) hci_dev_get_bd_addr_from_property(hdev); if (bacmp(&hdev->public_addr, BDADDR_ANY) && - hdev->set_bdaddr) + hdev->set_bdaddr) { ret = hdev->set_bdaddr(hdev, &hdev->public_addr); + + /* If setting of the BD_ADDR from the device + * property succeeds, then treat the address + * as valid even if the invalid BD_ADDR + * quirk indicates otherwise. + */ + if (!ret) + invalid_bdaddr = false; + } } setup_failed: /* The transport driver can set these quirks before * creating the HCI device or in its setup callback. * + * For the invalid BD_ADDR quirk it is possible that + * it becomes a valid address if the bootloader does + * provide it (see above). + * * In case any of them is set, the controller has to * start up as unconfigured. */ if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) || - test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks)) + invalid_bdaddr) hci_dev_set_flag(hdev, HCI_UNCONFIGURED); /* For an unconfigured controller it is required to @@ -2289,6 +2311,33 @@ void hci_smp_irks_clear(struct hci_dev *hdev) } } +void hci_blocked_keys_clear(struct hci_dev *hdev) +{ + struct blocked_key *b; + + list_for_each_entry_rcu(b, &hdev->blocked_keys, list) { + list_del_rcu(&b->list); + kfree_rcu(b, rcu); + } +} + +bool hci_is_blocked_key(struct hci_dev *hdev, u8 type, u8 val[16]) +{ + bool blocked = false; + struct blocked_key *b; + + rcu_read_lock(); + list_for_each_entry(b, &hdev->blocked_keys, list) { + if (b->type == type && !memcmp(b->val, val, sizeof(b->val))) { + blocked = true; + break; + } + } + + rcu_read_unlock(); + return blocked; +} + struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct link_key *k; @@ -2297,6 +2346,16 @@ struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) list_for_each_entry_rcu(k, &hdev->link_keys, list) { if (bacmp(bdaddr, &k->bdaddr) == 0) { rcu_read_unlock(); + + if (hci_is_blocked_key(hdev, + HCI_BLOCKED_KEY_TYPE_LINKKEY, + k->val)) { + bt_dev_warn_ratelimited(hdev, + "Link key blocked for %pMR", + &k->bdaddr); + return NULL; + } + return k; } } @@ -2365,6 +2424,15 @@ struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, if (smp_ltk_is_sc(k) || ltk_role(k->type) == role) { rcu_read_unlock(); + + if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LTK, + k->val)) { + bt_dev_warn_ratelimited(hdev, + "LTK blocked for %pMR", + &k->bdaddr); + return NULL; + } + return k; } } @@ -2375,31 +2443,42 @@ struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa) { + struct smp_irk *irk_to_return = NULL; struct smp_irk *irk; rcu_read_lock(); list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (!bacmp(&irk->rpa, rpa)) { - rcu_read_unlock(); - return irk; + irk_to_return = irk; + goto done; } } list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (smp_irk_matches(hdev, irk->val, rpa)) { bacpy(&irk->rpa, rpa); - rcu_read_unlock(); - return irk; + irk_to_return = irk; + goto done; } } + +done: + if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, + irk_to_return->val)) { + bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR", + &irk_to_return->bdaddr); + irk_to_return = NULL; + } + rcu_read_unlock(); - return NULL; + return irk_to_return; } struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) { + struct smp_irk *irk_to_return = NULL; struct smp_irk *irk; /* Identity Address must be public or static random */ @@ -2410,13 +2489,23 @@ struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr, list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (addr_type == irk->addr_type && bacmp(bdaddr, &irk->bdaddr) == 0) { - rcu_read_unlock(); - return irk; + irk_to_return = irk; + goto done; } } + +done: + + if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, + irk_to_return->val)) { + bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR", + &irk_to_return->bdaddr); + irk_to_return = NULL; + } + rcu_read_unlock(); - return NULL; + return irk_to_return; } struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, @@ -3222,6 +3311,7 @@ struct hci_dev *hci_alloc_dev(void) INIT_LIST_HEAD(&hdev->pend_le_reports); INIT_LIST_HEAD(&hdev->conn_hash.list); INIT_LIST_HEAD(&hdev->adv_instances); + INIT_LIST_HEAD(&hdev->blocked_keys); INIT_WORK(&hdev->rx_work, hci_rx_work); INIT_WORK(&hdev->cmd_work, hci_cmd_work); @@ -3421,6 +3511,7 @@ void hci_unregister_dev(struct hci_dev *hdev) hci_bdaddr_list_clear(&hdev->le_resolv_list); hci_conn_params_clear_all(hdev); hci_discovery_filter_clear(hdev); + hci_blocked_keys_clear(hdev); hci_dev_unlock(hdev); hci_dev_put(hdev); @@ -3474,7 +3565,8 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && - hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) { + hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && + hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) { kfree_skb(skb); return -EINVAL; } @@ -4196,15 +4288,10 @@ static void hci_sched_le(struct hci_dev *hdev) if (!hci_conn_num(hdev, LE_LINK)) return; - if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { - /* LE tx timeout must be longer than maximum - * link supervision timeout (40.9 seconds) */ - if (!hdev->le_cnt && hdev->le_pkts && - time_after(jiffies, hdev->le_last_tx + HZ * 45)) - hci_link_tx_to(hdev, LE_LINK); - } - cnt = hdev->le_pkts ? hdev->le_cnt : hdev->acl_cnt; + + __check_timeout(hdev, cnt); + tmp = cnt; while (cnt && (chan = hci_chan_sent(hdev, LE_LINK, "e))) { u32 priority = (skb_peek(&chan->data_q))->priority; @@ -4440,7 +4527,14 @@ static void hci_rx_work(struct work_struct *work) hci_send_to_sock(hdev, skb); } - if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { + /* If the device has been opened in HCI_USER_CHANNEL, + * the userspace has exclusive access to device. + * When device is HCI_INIT, we still need to process + * the data packets to the driver in order + * to complete its setup(). + */ + if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + !test_bit(HCI_INIT, &hdev->flags)) { kfree_skb(skb); continue; } @@ -4450,6 +4544,7 @@ static void hci_rx_work(struct work_struct *work) switch (hci_skb_pkt_type(skb)) { case HCI_ACLDATA_PKT: case HCI_SCODATA_PKT: + case HCI_ISODATA_PKT: kfree_skb(skb); continue; } diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 402e2cc54044..6b1314c738b8 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -26,6 +26,7 @@ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> +#include "smp.h" #include "hci_debugfs.h" #define DEFINE_QUIRK_ATTRIBUTE(__name, __quirk) \ @@ -152,6 +153,21 @@ static int blacklist_show(struct seq_file *f, void *p) DEFINE_SHOW_ATTRIBUTE(blacklist); +static int blocked_keys_show(struct seq_file *f, void *p) +{ + struct hci_dev *hdev = f->private; + struct blocked_key *key; + + rcu_read_lock(); + list_for_each_entry_rcu(key, &hdev->blocked_keys, list) + seq_printf(f, "%u %*phN\n", key->type, 16, key->val); + rcu_read_unlock(); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(blocked_keys); + static int uuids_show(struct seq_file *f, void *p) { struct hci_dev *hdev = f->private; @@ -308,6 +324,8 @@ void hci_debugfs_create_common(struct hci_dev *hdev) &device_list_fops); debugfs_create_file("blacklist", 0444, hdev->debugfs, hdev, &blacklist_fops); + debugfs_create_file("blocked_keys", 0444, hdev->debugfs, hdev, + &blocked_keys_fops); debugfs_create_file("uuids", 0444, hdev->debugfs, hdev, &uuids_fops); debugfs_create_file("remote_oob", 0400, hdev->debugfs, hdev, &remote_oob_fops); @@ -972,6 +990,62 @@ static int adv_max_interval_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(adv_max_interval_fops, adv_max_interval_get, adv_max_interval_set, "%llu\n"); +static int min_key_size_set(void *data, u64 val) +{ + struct hci_dev *hdev = data; + + if (val > hdev->le_max_key_size || val < SMP_MIN_ENC_KEY_SIZE) + return -EINVAL; + + hci_dev_lock(hdev); + hdev->le_min_key_size = val; + hci_dev_unlock(hdev); + + return 0; +} + +static int min_key_size_get(void *data, u64 *val) +{ + struct hci_dev *hdev = data; + + hci_dev_lock(hdev); + *val = hdev->le_min_key_size; + hci_dev_unlock(hdev); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(min_key_size_fops, min_key_size_get, + min_key_size_set, "%llu\n"); + +static int max_key_size_set(void *data, u64 val) +{ + struct hci_dev *hdev = data; + + if (val > SMP_MAX_ENC_KEY_SIZE || val < hdev->le_min_key_size) + return -EINVAL; + + hci_dev_lock(hdev); + hdev->le_max_key_size = val; + hci_dev_unlock(hdev); + + return 0; +} + +static int max_key_size_get(void *data, u64 *val) +{ + struct hci_dev *hdev = data; + + hci_dev_lock(hdev); + *val = hdev->le_max_key_size; + hci_dev_unlock(hdev); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(max_key_size_fops, max_key_size_get, + max_key_size_set, "%llu\n"); + static int auth_payload_timeout_set(void *data, u64 val) { struct hci_dev *hdev = data; @@ -1054,6 +1128,10 @@ void hci_debugfs_create_le(struct hci_dev *hdev) &adv_max_interval_fops); debugfs_create_u16("discov_interleaved_timeout", 0644, hdev->debugfs, &hdev->discov_interleaved_timeout); + debugfs_create_file("min_key_size", 0644, hdev->debugfs, hdev, + &min_key_size_fops); + debugfs_create_file("max_key_size", 0644, hdev->debugfs, hdev, + &max_key_size_fops); debugfs_create_file("auth_payload_timeout", 0644, hdev->debugfs, hdev, &auth_payload_timeout_fops); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index cdb00c2ef242..6ddc4a74a5e4 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5451,7 +5451,7 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } -static u8 ext_evt_type_to_legacy(u16 evt_type) +static u8 ext_evt_type_to_legacy(struct hci_dev *hdev, u16 evt_type) { if (evt_type & LE_EXT_ADV_LEGACY_PDU) { switch (evt_type) { @@ -5468,10 +5468,7 @@ static u8 ext_evt_type_to_legacy(u16 evt_type) return LE_ADV_SCAN_RSP; } - BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x", - evt_type); - - return LE_ADV_INVALID; + goto invalid; } if (evt_type & LE_EXT_ADV_CONN_IND) { @@ -5491,8 +5488,9 @@ static u8 ext_evt_type_to_legacy(u16 evt_type) evt_type & LE_EXT_ADV_DIRECT_IND) return LE_ADV_NONCONN_IND; - BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x", - evt_type); +invalid: + bt_dev_err_ratelimited(hdev, "Unknown advertising packet type: 0x%02x", + evt_type); return LE_ADV_INVALID; } @@ -5510,7 +5508,7 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) u16 evt_type; evt_type = __le16_to_cpu(ev->evt_type); - legacy_evt_type = ext_evt_type_to_legacy(evt_type); + legacy_evt_type = ext_evt_type_to_legacy(hdev, evt_type); if (legacy_evt_type != LE_ADV_INVALID) { process_adv_report(hdev, legacy_evt_type, &ev->bdaddr, ev->bdaddr_type, NULL, 0, ev->rssi, @@ -5660,11 +5658,6 @@ static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev, return send_conn_param_neg_reply(hdev, handle, HCI_ERROR_UNKNOWN_CONN_ID); - if (min < hcon->le_conn_min_interval || - max > hcon->le_conn_max_interval) - return send_conn_param_neg_reply(hdev, handle, - HCI_ERROR_INVALID_LL_PARAMS); - if (hci_check_conn_params(min, max, latency, timeout)) return send_conn_param_neg_reply(hdev, handle, HCI_ERROR_INVALID_LL_PARAMS); @@ -5725,6 +5718,29 @@ static void hci_le_direct_adv_report_evt(struct hci_dev *hdev, hci_dev_unlock(hdev); } +static void hci_le_phy_update_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_le_phy_update_complete *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); + + if (!ev->status) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (!conn) + goto unlock; + + conn->le_tx_phy = ev->tx_phy; + conn->le_rx_phy = ev->rx_phy; + +unlock: + hci_dev_unlock(hdev); +} + static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_le_meta *le_ev = (void *) skb->data; @@ -5760,6 +5776,10 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_le_direct_adv_report_evt(hdev, skb); break; + case HCI_EV_LE_PHY_UPDATE_COMPLETE: + hci_le_phy_update_evt(hdev, skb); + break; + case HCI_EV_LE_EXT_ADV_REPORT: hci_le_ext_adv_report_evt(hdev, skb); break; diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 621f1a97d803..2a1b64dbf76e 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -904,9 +904,9 @@ static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance) { struct adv_info *adv_instance; - /* Ignore instance 0 */ + /* Instance 0x00 always set local name */ if (instance == 0x00) - return 0; + return 1; adv_instance = hci_find_adv_instance(hdev, instance); if (!adv_instance) @@ -923,9 +923,9 @@ static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) u8 instance = hdev->cur_adv_instance; struct adv_info *adv_instance; - /* Ignore instance 0 */ + /* Instance 0x00 always set local name */ if (instance == 0x00) - return 0; + return 1; adv_instance = hci_find_adv_instance(hdev, instance); if (!adv_instance) @@ -1054,6 +1054,7 @@ void __hci_req_enable_advertising(struct hci_request *req) struct hci_cp_le_set_adv_param cp; u8 own_addr_type, enable = 0x01; bool connectable; + u16 adv_min_interval, adv_max_interval; u32 flags; flags = get_adv_instance_flags(hdev, hdev->cur_adv_instance); @@ -1087,16 +1088,30 @@ void __hci_req_enable_advertising(struct hci_request *req) return; memset(&cp, 0, sizeof(cp)); - cp.min_interval = cpu_to_le16(hdev->le_adv_min_interval); - cp.max_interval = cpu_to_le16(hdev->le_adv_max_interval); - if (connectable) + if (connectable) { cp.type = LE_ADV_IND; - else if (get_cur_adv_instance_scan_rsp_len(hdev)) - cp.type = LE_ADV_SCAN_IND; - else - cp.type = LE_ADV_NONCONN_IND; + adv_min_interval = hdev->le_adv_min_interval; + adv_max_interval = hdev->le_adv_max_interval; + } else { + if (get_cur_adv_instance_scan_rsp_len(hdev)) + cp.type = LE_ADV_SCAN_IND; + else + cp.type = LE_ADV_NONCONN_IND; + + if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE) || + hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) { + adv_min_interval = DISCOV_LE_FAST_ADV_INT_MIN; + adv_max_interval = DISCOV_LE_FAST_ADV_INT_MAX; + } else { + adv_min_interval = hdev->le_adv_min_interval; + adv_max_interval = hdev->le_adv_max_interval; + } + } + + cp.min_interval = cpu_to_le16(adv_min_interval); + cp.max_interval = cpu_to_le16(adv_max_interval); cp.own_address_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; @@ -1258,6 +1273,14 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) instance_flags = get_adv_instance_flags(hdev, instance); + /* If instance already has the flags set skip adding it once + * again. + */ + if (adv_instance && eir_get_data(adv_instance->adv_data, + adv_instance->adv_data_len, EIR_FLAGS, + NULL)) + goto skip_flags; + /* The Add Advertising command allows userspace to set both the general * and limited discoverable flags. */ @@ -1290,6 +1313,7 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) } } +skip_flags: if (adv_instance) { memcpy(ptr, adv_instance->adv_data, adv_instance->adv_data_len); @@ -1675,7 +1699,7 @@ int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance) * scheduling it. */ if (adv_instance && adv_instance->duration) { - u16 duration = adv_instance->duration * MSEC_PER_SEC; + u16 duration = adv_instance->timeout * MSEC_PER_SEC; /* Time = N * 10 ms */ adv_set->duration = cpu_to_le16(duration / 10); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index d32077b28433..9c4a093f8960 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -23,7 +23,7 @@ */ /* Bluetooth HCI sockets. */ - +#include <linux/compat.h> #include <linux/export.h> #include <linux/utsname.h> #include <linux/sched.h> @@ -211,7 +211,8 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT && hci_skb_pkt_type(skb) != HCI_EVENT_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && - hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) + hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && + hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) continue; if (is_filtered_packet(sk, skb)) continue; @@ -220,7 +221,8 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) continue; if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && - hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) + hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && + hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) continue; } else { /* Don't send frame to other channel types */ @@ -324,6 +326,12 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) else opcode = cpu_to_le16(HCI_MON_SCO_TX_PKT); break; + case HCI_ISODATA_PKT: + if (bt_cb(skb)->incoming) + opcode = cpu_to_le16(HCI_MON_ISO_RX_PKT); + else + opcode = cpu_to_le16(HCI_MON_ISO_TX_PKT); + break; case HCI_DIAG_PKT: opcode = cpu_to_le16(HCI_MON_VENDOR_DIAG); break; @@ -831,6 +839,8 @@ static int hci_sock_release(struct socket *sock) if (!sk) return 0; + lock_sock(sk); + switch (hci_pi(sk)->channel) { case HCI_CHANNEL_MONITOR: atomic_dec(&monitor_promisc); @@ -878,6 +888,7 @@ static int hci_sock_release(struct socket *sock) skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); + release_sock(sk); sock_put(sk); return 0; } @@ -1054,6 +1065,22 @@ done: return err; } +#ifdef CONFIG_COMPAT +static int hci_sock_compat_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case HCIDEVUP: + case HCIDEVDOWN: + case HCIDEVRESET: + case HCIDEVRESTAT: + return hci_sock_ioctl(sock, cmd, arg); + } + + return hci_sock_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { @@ -1746,7 +1773,8 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, */ if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && - hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) { + hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && + hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) { err = -EINVAL; goto drop; } @@ -1790,7 +1818,8 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, } if (hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && - hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) { + hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && + hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) { err = -EINVAL; goto drop; } @@ -1974,6 +2003,9 @@ static const struct proto_ops hci_sock_ops = { .sendmsg = hci_sock_sendmsg, .recvmsg = hci_sock_recvmsg, .ioctl = hci_sock_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = hci_sock_compat_ioctl, +#endif .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 8d889969ae7e..bef84b95e2c4 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -267,7 +267,7 @@ static int hidp_get_raw_report(struct hid_device *hid, set_bit(HIDP_WAITING_FOR_RETURN, &session->flags); data[0] = report_number; ret = hidp_send_ctrl_message(session, report_type, data, 1); - if (ret) + if (ret < 0) goto err; /* Wait for the return of the report. The returned report @@ -343,7 +343,7 @@ static int hidp_set_raw_report(struct hid_device *hid, unsigned char reportnum, data[0] = reportnum; set_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags); ret = hidp_send_ctrl_message(session, report_type, data, count); - if (ret) + if (ret < 0) goto err; /* Wait for the ACK from the device. */ diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index dfc1edb168b7..195459a1e53e 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1289,6 +1289,9 @@ static void l2cap_le_connect(struct l2cap_chan *chan) if (test_and_set_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags)) return; + if (!chan->imtu) + chan->imtu = chan->conn->mtu; + l2cap_le_flowctl_init(chan, 0); req.psm = chan->psm; @@ -3226,6 +3229,49 @@ static inline void l2cap_txwin_setup(struct l2cap_chan *chan) chan->ack_win = chan->tx_win; } +static void l2cap_mtu_auto(struct l2cap_chan *chan) +{ + struct hci_conn *conn = chan->conn->hcon; + + chan->imtu = L2CAP_DEFAULT_MIN_MTU; + + /* The 2-DH1 packet has between 2 and 56 information bytes + * (including the 2-byte payload header) + */ + if (!(conn->pkt_type & HCI_2DH1)) + chan->imtu = 54; + + /* The 3-DH1 packet has between 2 and 85 information bytes + * (including the 2-byte payload header) + */ + if (!(conn->pkt_type & HCI_3DH1)) + chan->imtu = 83; + + /* The 2-DH3 packet has between 2 and 369 information bytes + * (including the 2-byte payload header) + */ + if (!(conn->pkt_type & HCI_2DH3)) + chan->imtu = 367; + + /* The 3-DH3 packet has between 2 and 554 information bytes + * (including the 2-byte payload header) + */ + if (!(conn->pkt_type & HCI_3DH3)) + chan->imtu = 552; + + /* The 2-DH5 packet has between 2 and 681 information bytes + * (including the 2-byte payload header) + */ + if (!(conn->pkt_type & HCI_2DH5)) + chan->imtu = 679; + + /* The 3-DH5 packet has between 2 and 1023 information bytes + * (including the 2-byte payload header) + */ + if (!(conn->pkt_type & HCI_3DH5)) + chan->imtu = 1021; +} + static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size) { struct l2cap_conf_req *req = data; @@ -3255,8 +3301,12 @@ static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data } done: - if (chan->imtu != L2CAP_DEFAULT_MTU) - l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, endptr - ptr); + if (chan->imtu != L2CAP_DEFAULT_MTU) { + if (!chan->imtu) + l2cap_mtu_auto(chan); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, + endptr - ptr); + } switch (chan->mode) { case L2CAP_MODE_BASIC: @@ -4936,10 +4986,8 @@ void __l2cap_physical_cfm(struct l2cap_chan *chan, int result) BT_DBG("chan %p, result %d, local_amp_id %d, remote_amp_id %d", chan, result, local_amp_id, remote_amp_id); - if (chan->state == BT_DISCONN || chan->state == BT_CLOSED) { - l2cap_chan_unlock(chan); + if (chan->state == BT_DISCONN || chan->state == BT_CLOSED) return; - } if (chan->state != BT_CONNECTED) { l2cap_do_create(chan, result, local_amp_id, remote_amp_id); @@ -5033,7 +5081,6 @@ static inline int l2cap_move_channel_req(struct l2cap_conn *conn, chan->move_role = L2CAP_MOVE_ROLE_RESPONDER; l2cap_move_setup(chan); chan->move_id = req->dest_amp_id; - icid = chan->dcid; if (req->dest_amp_id == AMP_ID_BREDR) { /* Moving to BR/EDR */ @@ -5305,14 +5352,7 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, memset(&rsp, 0, sizeof(rsp)); - if (min < hcon->le_conn_min_interval || - max > hcon->le_conn_max_interval) { - BT_DBG("requested connection interval exceeds current bounds."); - err = -EINVAL; - } else { - err = hci_check_conn_params(min, max, latency, to_multiplier); - } - + err = hci_check_conn_params(min, max, latency, to_multiplier); if (err) rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED); else diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index 63e65d9b4b24..c09e0a3a0ed9 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -183,6 +183,22 @@ void bt_err(const char *format, ...) } EXPORT_SYMBOL(bt_err); +void bt_warn_ratelimited(const char *format, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + pr_warn_ratelimited("%pV", &vaf); + + va_end(args); +} +EXPORT_SYMBOL(bt_warn_ratelimited); + void bt_err_ratelimited(const char *format, ...) { struct va_format vaf; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 150114e33b20..3074363c68df 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -38,7 +38,7 @@ #include "mgmt_util.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 14 +#define MGMT_REVISION 15 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -106,6 +106,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_START_LIMITED_DISCOVERY, MGMT_OP_READ_EXT_INFO, MGMT_OP_SET_APPEARANCE, + MGMT_OP_SET_BLOCKED_KEYS, }; static const u16 mgmt_events[] = { @@ -175,7 +176,7 @@ static const u16 mgmt_untrusted_events[] = { "\x00\x00\x00\x00\x00\x00\x00\x00" /* HCI to MGMT error code conversion table */ -static u8 mgmt_status_table[] = { +static const u8 mgmt_status_table[] = { MGMT_STATUS_SUCCESS, MGMT_STATUS_UNKNOWN_COMMAND, /* Unknown Command */ MGMT_STATUS_NOT_CONNECTED, /* No Connection */ @@ -2341,6 +2342,14 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, for (i = 0; i < key_count; i++) { struct mgmt_link_key_info *key = &cp->keys[i]; + if (hci_is_blocked_key(hdev, + HCI_BLOCKED_KEY_TYPE_LINKKEY, + key->val)) { + bt_dev_warn(hdev, "Skipping blocked link key for %pMR", + &key->addr.bdaddr); + continue; + } + /* Always ignore debug keys and require a new pairing if * the user wants to use them. */ @@ -2588,7 +2597,6 @@ static int get_connections(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_rp_get_connections *rp; struct hci_conn *c; - size_t rp_len; int err; u16 i; @@ -2608,8 +2616,7 @@ static int get_connections(struct sock *sk, struct hci_dev *hdev, void *data, i++; } - rp_len = sizeof(*rp) + (i * sizeof(struct mgmt_addr_info)); - rp = kmalloc(rp_len, GFP_KERNEL); + rp = kmalloc(struct_size(rp, addr, i), GFP_KERNEL); if (!rp) { err = -ENOMEM; goto unlock; @@ -2629,10 +2636,8 @@ static int get_connections(struct sock *sk, struct hci_dev *hdev, void *data, rp->conn_count = cpu_to_le16(i); /* Recalculate length in case of filtered SCO connections, etc */ - rp_len = sizeof(*rp) + (i * sizeof(struct mgmt_addr_info)); - err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONNECTIONS, 0, rp, - rp_len); + struct_size(rp, addr, i)); kfree(rp); @@ -3286,7 +3291,7 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_set_appearance *cp = data; - u16 apperance; + u16 appearance; int err; BT_DBG(""); @@ -3295,12 +3300,12 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_APPEARANCE, MGMT_STATUS_NOT_SUPPORTED); - apperance = le16_to_cpu(cp->appearance); + appearance = le16_to_cpu(cp->appearance); hci_dev_lock(hdev); - if (hdev->appearance != apperance) { - hdev->appearance = apperance; + if (hdev->appearance != appearance) { + hdev->appearance = appearance; if (hci_dev_test_flag(hdev, HCI_LE_ADV)) adv_expire(hdev, MGMT_ADV_FLAG_APPEARANCE); @@ -3535,6 +3540,55 @@ unlock: return err; } +static int set_blocked_keys(struct sock *sk, struct hci_dev *hdev, void *data, + u16 len) +{ + int err = MGMT_STATUS_SUCCESS; + struct mgmt_cp_set_blocked_keys *keys = data; + const u16 max_key_count = ((U16_MAX - sizeof(*keys)) / + sizeof(struct mgmt_blocked_key_info)); + u16 key_count, expected_len; + int i; + + BT_DBG("request for %s", hdev->name); + + key_count = __le16_to_cpu(keys->key_count); + if (key_count > max_key_count) { + bt_dev_err(hdev, "too big key_count value %u", key_count); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS, + MGMT_STATUS_INVALID_PARAMS); + } + + expected_len = struct_size(keys, keys, key_count); + if (expected_len != len) { + bt_dev_err(hdev, "expected %u bytes, got %u bytes", + expected_len, len); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS, + MGMT_STATUS_INVALID_PARAMS); + } + + hci_dev_lock(hdev); + + hci_blocked_keys_clear(hdev); + + for (i = 0; i < keys->key_count; ++i) { + struct blocked_key *b = kzalloc(sizeof(*b), GFP_KERNEL); + + if (!b) { + err = MGMT_STATUS_NO_RESOURCES; + break; + } + + b->type = keys->keys[i].type; + memcpy(b->val, keys->keys[i].val, sizeof(b->val)); + list_add_rcu(&b->list, &hdev->blocked_keys); + } + hci_dev_unlock(hdev); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS, + err, NULL, 0); +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -5055,6 +5109,14 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, for (i = 0; i < irk_count; i++) { struct mgmt_irk_info *irk = &cp->irks[i]; + if (hci_is_blocked_key(hdev, + HCI_BLOCKED_KEY_TYPE_IRK, + irk->val)) { + bt_dev_warn(hdev, "Skipping blocked IRK for %pMR", + &irk->addr.bdaddr); + continue; + } + hci_add_irk(hdev, &irk->addr.bdaddr, le_addr_type(irk->addr.type), irk->val, BDADDR_ANY); @@ -5138,6 +5200,14 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, struct mgmt_ltk_info *key = &cp->keys[i]; u8 type, authenticated; + if (hci_is_blocked_key(hdev, + HCI_BLOCKED_KEY_TYPE_LTK, + key->val)) { + bt_dev_warn(hdev, "Skipping blocked LTK for %pMR", + &key->addr.bdaddr); + continue; + } + switch (key->type) { case MGMT_LTK_UNAUTHENTICATED: authenticated = 0x00; @@ -6918,6 +6988,8 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { set_appearance, MGMT_SET_APPEARANCE_SIZE }, { get_phy_configuration, MGMT_GET_PHY_CONFIGURATION_SIZE }, { set_phy_configuration, MGMT_SET_PHY_CONFIGURATION_SIZE }, + { set_blocked_keys, MGMT_OP_SET_BLOCKED_KEYS_SIZE, + HCI_MGMT_VAR_LEN }, }; void mgmt_index_added(struct hci_dev *hdev) diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 90bb53aa4bee..b4eaf21360ef 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -24,7 +24,7 @@ /* * RFCOMM sockets. */ - +#include <linux/compat.h> #include <linux/export.h> #include <linux/debugfs.h> #include <linux/sched/signal.h> @@ -909,6 +909,13 @@ static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned lon return err; } +#ifdef CONFIG_COMPAT +static int rfcomm_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return rfcomm_sock_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + static int rfcomm_sock_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; @@ -1042,7 +1049,10 @@ static const struct proto_ops rfcomm_sock_ops = { .gettstamp = sock_gettstamp, .poll = bt_sock_poll, .socketpair = sock_no_socketpair, - .mmap = sock_no_mmap + .mmap = sock_no_mmap, +#ifdef CONFIG_COMPAT + .compat_ioctl = rfcomm_sock_compat_ioctl, +#endif }; static const struct net_proto_family rfcomm_sock_family_ops = { diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 6c2b4e6e87ba..204f14f8b507 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -23,6 +23,7 @@ #include <linux/debugfs.h> #include <linux/scatterlist.h> #include <linux/crypto.h> +#include <crypto/aes.h> #include <crypto/algapi.h> #include <crypto/b128ops.h> #include <crypto/hash.h> @@ -88,7 +89,6 @@ struct smp_dev { u8 local_rand[16]; bool debug_key; - struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; struct crypto_kpp *tfm_ecdh; }; @@ -127,7 +127,6 @@ struct smp_chan { u8 dhkey[32]; u8 mackey[16]; - struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; struct crypto_kpp *tfm_ecdh; }; @@ -377,22 +376,18 @@ static int smp_h7(struct crypto_shash *tfm_cmac, const u8 w[16], * s1 and ah. */ -static int smp_e(struct crypto_cipher *tfm, const u8 *k, u8 *r) +static int smp_e(const u8 *k, u8 *r) { + struct crypto_aes_ctx ctx; uint8_t tmp[16], data[16]; int err; SMP_DBG("k %16phN r %16phN", k, r); - if (!tfm) { - BT_ERR("tfm %p", tfm); - return -EINVAL; - } - /* The most significant octet of key corresponds to k[0] */ swap_buf(k, tmp, 16); - err = crypto_cipher_setkey(tfm, tmp, 16); + err = aes_expandkey(&ctx, tmp, 16); if (err) { BT_ERR("cipher setkey failed: %d", err); return err; @@ -401,17 +396,18 @@ static int smp_e(struct crypto_cipher *tfm, const u8 *k, u8 *r) /* Most significant octet of plaintextData corresponds to data[0] */ swap_buf(r, data, 16); - crypto_cipher_encrypt_one(tfm, data, data); + aes_encrypt(&ctx, data, data); /* Most significant octet of encryptedData corresponds to data[0] */ swap_buf(data, r, 16); SMP_DBG("r %16phN", r); + memzero_explicit(&ctx, sizeof (ctx)); return err; } -static int smp_c1(struct crypto_cipher *tfm_aes, const u8 k[16], +static int smp_c1(const u8 k[16], const u8 r[16], const u8 preq[7], const u8 pres[7], u8 _iat, const bdaddr_t *ia, u8 _rat, const bdaddr_t *ra, u8 res[16]) { @@ -436,7 +432,7 @@ static int smp_c1(struct crypto_cipher *tfm_aes, const u8 k[16], u128_xor((u128 *) res, (u128 *) r, (u128 *) p1); /* res = e(k, res) */ - err = smp_e(tfm_aes, k, res); + err = smp_e(k, res); if (err) { BT_ERR("Encrypt data error"); return err; @@ -453,14 +449,14 @@ static int smp_c1(struct crypto_cipher *tfm_aes, const u8 k[16], u128_xor((u128 *) res, (u128 *) res, (u128 *) p2); /* res = e(k, res) */ - err = smp_e(tfm_aes, k, res); + err = smp_e(k, res); if (err) BT_ERR("Encrypt data error"); return err; } -static int smp_s1(struct crypto_cipher *tfm_aes, const u8 k[16], +static int smp_s1(const u8 k[16], const u8 r1[16], const u8 r2[16], u8 _r[16]) { int err; @@ -469,15 +465,14 @@ static int smp_s1(struct crypto_cipher *tfm_aes, const u8 k[16], memcpy(_r, r2, 8); memcpy(_r + 8, r1, 8); - err = smp_e(tfm_aes, k, _r); + err = smp_e(k, _r); if (err) BT_ERR("Encrypt data error"); return err; } -static int smp_ah(struct crypto_cipher *tfm, const u8 irk[16], - const u8 r[3], u8 res[3]) +static int smp_ah(const u8 irk[16], const u8 r[3], u8 res[3]) { u8 _res[16]; int err; @@ -486,7 +481,7 @@ static int smp_ah(struct crypto_cipher *tfm, const u8 irk[16], memcpy(_res, r, 3); memset(_res + 3, 0, 13); - err = smp_e(tfm, irk, _res); + err = smp_e(irk, _res); if (err) { BT_ERR("Encrypt error"); return err; @@ -507,18 +502,15 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], const bdaddr_t *bdaddr) { struct l2cap_chan *chan = hdev->smp_data; - struct smp_dev *smp; u8 hash[3]; int err; if (!chan || !chan->data) return false; - smp = chan->data; - BT_DBG("RPA %pMR IRK %*phN", bdaddr, 16, irk); - err = smp_ah(smp->tfm_aes, irk, &bdaddr->b[3], hash); + err = smp_ah(irk, &bdaddr->b[3], hash); if (err) return false; @@ -528,20 +520,17 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa) { struct l2cap_chan *chan = hdev->smp_data; - struct smp_dev *smp; int err; if (!chan || !chan->data) return -EOPNOTSUPP; - smp = chan->data; - get_random_bytes(&rpa->b[3], 3); rpa->b[5] &= 0x3f; /* Clear two most significant bits */ rpa->b[5] |= 0x40; /* Set second most significant bit */ - err = smp_ah(smp->tfm_aes, irk, &rpa->b[3], rpa->b); + err = smp_ah(irk, &rpa->b[3], rpa->b); if (err < 0) return err; @@ -768,7 +757,6 @@ static void smp_chan_destroy(struct l2cap_conn *conn) kzfree(smp->slave_csrk); kzfree(smp->link_key); - crypto_free_cipher(smp->tfm_aes); crypto_free_shash(smp->tfm_cmac); crypto_free_kpp(smp->tfm_ecdh); @@ -957,7 +945,7 @@ static u8 smp_confirm(struct smp_chan *smp) BT_DBG("conn %p", conn); - ret = smp_c1(smp->tfm_aes, smp->tk, smp->prnd, smp->preq, smp->prsp, + ret = smp_c1(smp->tk, smp->prnd, smp->preq, smp->prsp, conn->hcon->init_addr_type, &conn->hcon->init_addr, conn->hcon->resp_addr_type, &conn->hcon->resp_addr, cp.confirm_val); @@ -983,12 +971,9 @@ static u8 smp_random(struct smp_chan *smp) u8 confirm[16]; int ret; - if (IS_ERR_OR_NULL(smp->tfm_aes)) - return SMP_UNSPECIFIED; - BT_DBG("conn %p %s", conn, conn->hcon->out ? "master" : "slave"); - ret = smp_c1(smp->tfm_aes, smp->tk, smp->rrnd, smp->preq, smp->prsp, + ret = smp_c1(smp->tk, smp->rrnd, smp->preq, smp->prsp, hcon->init_addr_type, &hcon->init_addr, hcon->resp_addr_type, &hcon->resp_addr, confirm); if (ret) @@ -1005,7 +990,7 @@ static u8 smp_random(struct smp_chan *smp) __le64 rand = 0; __le16 ediv = 0; - smp_s1(smp->tfm_aes, smp->tk, smp->rrnd, smp->prnd, stk); + smp_s1(smp->tk, smp->rrnd, smp->prnd, stk); if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags)) return SMP_UNSPECIFIED; @@ -1021,7 +1006,7 @@ static u8 smp_random(struct smp_chan *smp) smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); - smp_s1(smp->tfm_aes, smp->tk, smp->prnd, smp->rrnd, stk); + smp_s1(smp->tk, smp->prnd, smp->rrnd, stk); if (hcon->pending_sec_level == BT_SECURITY_HIGH) auth = 1; @@ -1389,16 +1374,10 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) if (!smp) return NULL; - smp->tfm_aes = crypto_alloc_cipher("aes", 0, 0); - if (IS_ERR(smp->tfm_aes)) { - BT_ERR("Unable to create AES crypto context"); - goto zfree_smp; - } - smp->tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(smp->tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); - goto free_cipher; + goto zfree_smp; } smp->tfm_ecdh = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0); @@ -1420,8 +1399,6 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) free_shash: crypto_free_shash(smp->tfm_cmac); -free_cipher: - crypto_free_cipher(smp->tfm_aes); zfree_smp: kzfree(smp); return NULL; @@ -2476,6 +2453,15 @@ static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; + /* Pairing is aborted if any blocked keys are distributed */ + if (hci_is_blocked_key(conn->hcon->hdev, HCI_BLOCKED_KEY_TYPE_LTK, + rp->ltk)) { + bt_dev_warn_ratelimited(conn->hcon->hdev, + "LTK blocked for %pMR", + &conn->hcon->dst); + return SMP_INVALID_PARAMS; + } + SMP_ALLOW_CMD(smp, SMP_CMD_MASTER_IDENT); skb_pull(skb, sizeof(*rp)); @@ -2532,6 +2518,15 @@ static int smp_cmd_ident_info(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*info)) return SMP_INVALID_PARAMS; + /* Pairing is aborted if any blocked keys are distributed */ + if (hci_is_blocked_key(conn->hcon->hdev, HCI_BLOCKED_KEY_TYPE_IRK, + info->irk)) { + bt_dev_warn_ratelimited(conn->hcon->hdev, + "Identity key blocked for %pMR", + &conn->hcon->dst); + return SMP_INVALID_PARAMS; + } + SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_ADDR_INFO); skb_pull(skb, sizeof(*info)); @@ -3232,7 +3227,6 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) { struct l2cap_chan *chan; struct smp_dev *smp; - struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; struct crypto_kpp *tfm_ecdh; @@ -3245,17 +3239,9 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) if (!smp) return ERR_PTR(-ENOMEM); - tfm_aes = crypto_alloc_cipher("aes", 0, 0); - if (IS_ERR(tfm_aes)) { - BT_ERR("Unable to create AES crypto context"); - kzfree(smp); - return ERR_CAST(tfm_aes); - } - tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); - crypto_free_cipher(tfm_aes); kzfree(smp); return ERR_CAST(tfm_cmac); } @@ -3264,13 +3250,11 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) if (IS_ERR(tfm_ecdh)) { BT_ERR("Unable to create ECDH crypto context"); crypto_free_shash(tfm_cmac); - crypto_free_cipher(tfm_aes); kzfree(smp); return ERR_CAST(tfm_ecdh); } smp->local_oob = false; - smp->tfm_aes = tfm_aes; smp->tfm_cmac = tfm_cmac; smp->tfm_ecdh = tfm_ecdh; @@ -3278,7 +3262,6 @@ create_chan: chan = l2cap_chan_create(); if (!chan) { if (smp) { - crypto_free_cipher(smp->tfm_aes); crypto_free_shash(smp->tfm_cmac); crypto_free_kpp(smp->tfm_ecdh); kzfree(smp); @@ -3326,7 +3309,6 @@ static void smp_del_chan(struct l2cap_chan *chan) smp = chan->data; if (smp) { chan->data = NULL; - crypto_free_cipher(smp->tfm_aes); crypto_free_shash(smp->tfm_cmac); crypto_free_kpp(smp->tfm_ecdh); kzfree(smp); @@ -3391,94 +3373,6 @@ static const struct file_operations force_bredr_smp_fops = { .llseek = default_llseek, }; -static ssize_t le_min_key_size_read(struct file *file, - char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - char buf[4]; - - snprintf(buf, sizeof(buf), "%2u\n", hdev->le_min_key_size); - - return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); -} - -static ssize_t le_min_key_size_write(struct file *file, - const char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - char buf[32]; - size_t buf_size = min(count, (sizeof(buf) - 1)); - u8 key_size; - - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - buf[buf_size] = '\0'; - - sscanf(buf, "%hhu", &key_size); - - if (key_size > hdev->le_max_key_size || - key_size < SMP_MIN_ENC_KEY_SIZE) - return -EINVAL; - - hdev->le_min_key_size = key_size; - - return count; -} - -static const struct file_operations le_min_key_size_fops = { - .open = simple_open, - .read = le_min_key_size_read, - .write = le_min_key_size_write, - .llseek = default_llseek, -}; - -static ssize_t le_max_key_size_read(struct file *file, - char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - char buf[4]; - - snprintf(buf, sizeof(buf), "%2u\n", hdev->le_max_key_size); - - return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); -} - -static ssize_t le_max_key_size_write(struct file *file, - const char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - char buf[32]; - size_t buf_size = min(count, (sizeof(buf) - 1)); - u8 key_size; - - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - buf[buf_size] = '\0'; - - sscanf(buf, "%hhu", &key_size); - - if (key_size > SMP_MAX_ENC_KEY_SIZE || - key_size < hdev->le_min_key_size) - return -EINVAL; - - hdev->le_max_key_size = key_size; - - return count; -} - -static const struct file_operations le_max_key_size_fops = { - .open = simple_open, - .read = le_max_key_size_read, - .write = le_max_key_size_write, - .llseek = default_llseek, -}; - int smp_register(struct hci_dev *hdev) { struct l2cap_chan *chan; @@ -3503,11 +3397,6 @@ int smp_register(struct hci_dev *hdev) hdev->smp_data = chan; - debugfs_create_file("le_min_key_size", 0644, hdev->debugfs, hdev, - &le_min_key_size_fops); - debugfs_create_file("le_max_key_size", 0644, hdev->debugfs, hdev, - &le_max_key_size_fops); - /* If the controller does not support BR/EDR Secure Connections * feature, then the BR/EDR SMP channel shall not be present. * @@ -3582,7 +3471,7 @@ static int __init test_debug_key(struct crypto_kpp *tfm_ecdh) return 0; } -static int __init test_ah(struct crypto_cipher *tfm_aes) +static int __init test_ah(void) { const u8 irk[16] = { 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34, @@ -3592,7 +3481,7 @@ static int __init test_ah(struct crypto_cipher *tfm_aes) u8 res[3]; int err; - err = smp_ah(tfm_aes, irk, r, res); + err = smp_ah(irk, r, res); if (err) return err; @@ -3602,7 +3491,7 @@ static int __init test_ah(struct crypto_cipher *tfm_aes) return 0; } -static int __init test_c1(struct crypto_cipher *tfm_aes) +static int __init test_c1(void) { const u8 k[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -3622,7 +3511,7 @@ static int __init test_c1(struct crypto_cipher *tfm_aes) u8 res[16]; int err; - err = smp_c1(tfm_aes, k, r, preq, pres, _iat, &ia, _rat, &ra, res); + err = smp_c1(k, r, preq, pres, _iat, &ia, _rat, &ra, res); if (err) return err; @@ -3632,7 +3521,7 @@ static int __init test_c1(struct crypto_cipher *tfm_aes) return 0; } -static int __init test_s1(struct crypto_cipher *tfm_aes) +static int __init test_s1(void) { const u8 k[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -3647,7 +3536,7 @@ static int __init test_s1(struct crypto_cipher *tfm_aes) u8 res[16]; int err; - err = smp_s1(tfm_aes, k, r1, r2, res); + err = smp_s1(k, r1, r2, res); if (err) return err; @@ -3828,8 +3717,7 @@ static const struct file_operations test_smp_fops = { .llseek = default_llseek, }; -static int __init run_selftests(struct crypto_cipher *tfm_aes, - struct crypto_shash *tfm_cmac, +static int __init run_selftests(struct crypto_shash *tfm_cmac, struct crypto_kpp *tfm_ecdh) { ktime_t calltime, delta, rettime; @@ -3844,19 +3732,19 @@ static int __init run_selftests(struct crypto_cipher *tfm_aes, goto done; } - err = test_ah(tfm_aes); + err = test_ah(); if (err) { BT_ERR("smp_ah test failed"); goto done; } - err = test_c1(tfm_aes); + err = test_c1(); if (err) { BT_ERR("smp_c1 test failed"); goto done; } - err = test_s1(tfm_aes); + err = test_s1(); if (err) { BT_ERR("smp_s1 test failed"); goto done; @@ -3913,21 +3801,13 @@ done: int __init bt_selftest_smp(void) { - struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; struct crypto_kpp *tfm_ecdh; int err; - tfm_aes = crypto_alloc_cipher("aes", 0, 0); - if (IS_ERR(tfm_aes)) { - BT_ERR("Unable to create AES crypto context"); - return PTR_ERR(tfm_aes); - } - tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); - crypto_free_cipher(tfm_aes); return PTR_ERR(tfm_cmac); } @@ -3935,14 +3815,12 @@ int __init bt_selftest_smp(void) if (IS_ERR(tfm_ecdh)) { BT_ERR("Unable to create ECDH crypto context"); crypto_free_shash(tfm_cmac); - crypto_free_cipher(tfm_aes); return PTR_ERR(tfm_ecdh); } - err = run_selftests(tfm_aes, tfm_cmac, tfm_ecdh); + err = run_selftests(tfm_cmac, tfm_ecdh); crypto_free_shash(tfm_cmac); - crypto_free_cipher(tfm_aes); crypto_free_kpp(tfm_ecdh); return err; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 80e6f3a6864d..d555c0d8657d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -15,7 +15,7 @@ #include <trace/events/bpf_test_run.h> static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, - u32 *retval, u32 *time) + u32 *retval, u32 *time, bool xdp) { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; enum bpf_cgroup_storage_type stype; @@ -41,7 +41,11 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { bpf_cgroup_storage_set(storage); - *retval = BPF_PROG_RUN(prog, ctx); + + if (xdp) + *retval = bpf_prog_run_xdp(prog, ctx); + else + *retval = BPF_PROG_RUN(prog, ctx); if (signal_pending(current)) { ret = -EINTR; @@ -105,6 +109,40 @@ out: return err; } +/* Integer types of various sizes and pointer combinations cover variety of + * architecture dependent calling conventions. 7+ can be supported in the + * future. + */ +int noinline bpf_fentry_test1(int a) +{ + return a + 1; +} + +int noinline bpf_fentry_test2(int a, u64 b) +{ + return a + b; +} + +int noinline bpf_fentry_test3(char a, int b, u64 c) +{ + return a + b + c; +} + +int noinline bpf_fentry_test4(void *a, char b, int c, u64 d) +{ + return (long)a + b + c + d; +} + +int noinline bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e) +{ + return a + (long)b + c + d + e; +} + +int noinline bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f) +{ + return a + (long)b + c + d + (long)e + f; +} + static void *bpf_test_init(const union bpf_attr *kattr, u32 size, u32 headroom, u32 tailroom) { @@ -122,6 +160,15 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, kfree(data); return ERR_PTR(-EFAULT); } + if (bpf_fentry_test1(1) != 2 || + bpf_fentry_test2(2, 3) != 5 || + bpf_fentry_test3(4, 5, 6) != 15 || + bpf_fentry_test4((void *)7, 8, 9, 10) != 34 || + bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 || + bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111) { + kfree(data); + return ERR_PTR(-EFAULT); + } return data; } @@ -204,26 +251,53 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) return 0; /* make sure the fields we don't use are zeroed */ - if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, priority))) + if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, mark))) + return -EINVAL; + + /* mark is allowed */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, mark), + offsetof(struct __sk_buff, priority))) return -EINVAL; /* priority is allowed */ - if (!range_is_zero(__skb, offsetof(struct __sk_buff, priority) + - FIELD_SIZEOF(struct __sk_buff, priority), + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, priority), offsetof(struct __sk_buff, cb))) return -EINVAL; /* cb is allowed */ - if (!range_is_zero(__skb, offsetof(struct __sk_buff, cb) + - FIELD_SIZEOF(struct __sk_buff, cb), + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, cb), + offsetof(struct __sk_buff, tstamp))) + return -EINVAL; + + /* tstamp is allowed */ + /* wire_len is allowed */ + /* gso_segs is allowed */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs), sizeof(struct __sk_buff))) return -EINVAL; + skb->mark = __skb->mark; skb->priority = __skb->priority; + skb->tstamp = __skb->tstamp; memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); + if (__skb->wire_len == 0) { + cb->pkt_len = skb->len; + } else { + if (__skb->wire_len < skb->len || + __skb->wire_len > GSO_MAX_SIZE) + return -EINVAL; + cb->pkt_len = __skb->wire_len; + } + + if (__skb->gso_segs > GSO_MAX_SEGS) + return -EINVAL; + skb_shinfo(skb)->gso_segs = __skb->gso_segs; + return 0; } @@ -234,8 +308,12 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) if (!__skb) return; + __skb->mark = skb->mark; __skb->priority = skb->priority; + __skb->tstamp = skb->tstamp; memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); + __skb->wire_len = cb->pkt_len; + __skb->gso_segs = skb_shinfo(skb)->gso_segs; } int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, @@ -307,7 +385,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, ret = convert___skb_to_skb(skb, ctx); if (ret) goto out; - ret = bpf_test_run(prog, skb, repeat, &retval, &duration); + ret = bpf_test_run(prog, skb, repeat, &retval, &duration, false); if (ret) goto out; if (!is_l2) { @@ -364,8 +442,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp.rxq = &rxqueue->xdp_rxq; - - ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration); + bpf_prog_change_xdp(NULL, prog); + ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); if (ret) goto out; if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN || @@ -373,10 +451,26 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, size = xdp.data_end - xdp.data; ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); out: + bpf_prog_change_xdp(prog, NULL); kfree(data); return ret; } +static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx) +{ + /* make sure the fields we don't use are zeroed */ + if (!range_is_zero(ctx, 0, offsetof(struct bpf_flow_keys, flags))) + return -EINVAL; + + /* flags is allowed */ + + if (!range_is_zero(ctx, offsetofend(struct bpf_flow_keys, flags), + sizeof(struct bpf_flow_keys))) + return -EINVAL; + + return 0; +} + int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) @@ -384,9 +478,11 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, u32 size = kattr->test.data_size_in; struct bpf_flow_dissector ctx = {}; u32 repeat = kattr->test.repeat; + struct bpf_flow_keys *user_ctx; struct bpf_flow_keys flow_keys; u64 time_start, time_spent = 0; const struct ethhdr *eth; + unsigned int flags = 0; u32 retval, duration; void *data; int ret; @@ -395,9 +491,6 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) return -EINVAL; - if (kattr->test.ctx_in || kattr->test.ctx_out) - return -EINVAL; - if (size < ETH_HLEN) return -EINVAL; @@ -410,6 +503,18 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, if (!repeat) repeat = 1; + user_ctx = bpf_ctx_init(kattr, sizeof(struct bpf_flow_keys)); + if (IS_ERR(user_ctx)) { + kfree(data); + return PTR_ERR(user_ctx); + } + if (user_ctx) { + ret = verify_user_bpf_flow_keys(user_ctx); + if (ret) + goto out; + flags = user_ctx->flags; + } + ctx.flow_keys = &flow_keys; ctx.data = data; ctx.data_end = (__u8 *)data + size; @@ -419,7 +524,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, - size); + size, flags); if (signal_pending(current)) { preempt_enable(); @@ -450,8 +555,12 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys), retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, user_ctx, + sizeof(struct bpf_flow_keys)); out: + kfree(user_ctx); kfree(data); return ret; } diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile index aa945ab5b655..36580301da70 100644 --- a/net/bpfilter/Makefile +++ b/net/bpfilter/Makefile @@ -3,7 +3,7 @@ # Makefile for the Linux BPFILTER layer. # -hostprogs-y := bpfilter_umh +hostprogs := bpfilter_umh bpfilter_umh-objs := main.o KBUILD_HOSTCFLAGS += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi HOSTCC := $(CC) diff --git a/net/bridge/Makefile b/net/bridge/Makefile index ac9ef337f0fa..49da7ae6f077 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -20,7 +20,7 @@ obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o -bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o +bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o br_vlan_options.o bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o diff --git a/net/bridge/br.c b/net/bridge/br.c index 8a8f9e5f264f..b6fe30e3768f 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -312,7 +312,7 @@ static int __init br_init(void) { int err; - BUILD_BUG_ON(sizeof(struct br_input_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); + BUILD_BUG_ON(sizeof(struct br_input_skb_cb) > sizeof_field(struct sk_buff, cb)); err = stp_proto_register(&br_stp_proto); if (err < 0) { diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 681b72862c16..dc3d2c1dd9d5 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -24,8 +24,6 @@ const struct nf_br_ops __rcu *nf_br_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_br_ops); -static struct lock_class_key bridge_netdev_addr_lock_key; - /* net device transmit always called with BH disabled */ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -34,6 +32,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct net_bridge_mdb_entry *mdst; struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); const struct nf_br_ops *nf_ops; + u8 state = BR_STATE_FORWARDING; const unsigned char *dest; struct ethhdr *eth; u16 vid = 0; @@ -58,7 +57,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) eth = eth_hdr(skb); skb_pull(skb, ETH_HLEN); - if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid)) + if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, &state)) goto out; if (IS_ENABLED(CONFIG_INET) && @@ -108,11 +107,6 @@ out: return NETDEV_TX_OK; } -static void br_set_lockdep_class(struct net_device *dev) -{ - lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key); -} - static int br_dev_init(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); @@ -150,7 +144,6 @@ static int br_dev_init(struct net_device *dev) br_mdb_hash_fini(br); br_fdb_hash_fini(br); } - br_set_lockdep_class(dev); return err; } @@ -253,6 +246,12 @@ static int br_set_mac_address(struct net_device *dev, void *p) if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; + /* dev_set_mac_addr() can be called by a master device on bridge's + * NETDEV_UNREGISTER, but since it's being destroyed do nothing + */ + if (dev->reg_state != NETREG_REGISTERED) + return -EBUSY; + spin_lock_bh(&br->lock); if (!ether_addr_equal(dev->dev_addr, addr->sa_data)) { /* Mac address will be changed in br_stp_change_bridge_id(). */ @@ -271,6 +270,37 @@ static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info) strlcpy(info->bus_info, "N/A", sizeof(info->bus_info)); } +static int br_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) +{ + struct net_bridge *br = netdev_priv(dev); + struct net_bridge_port *p; + + cmd->base.duplex = DUPLEX_UNKNOWN; + cmd->base.port = PORT_OTHER; + cmd->base.speed = SPEED_UNKNOWN; + + list_for_each_entry(p, &br->port_list, list) { + struct ethtool_link_ksettings ecmd; + struct net_device *pdev = p->dev; + + if (!netif_running(pdev) || !netif_oper_up(pdev)) + continue; + + if (__ethtool_get_link_ksettings(pdev, &ecmd)) + continue; + + if (ecmd.base.speed == (__u32)SPEED_UNKNOWN) + continue; + + if (cmd->base.speed == (__u32)SPEED_UNKNOWN || + cmd->base.speed < ecmd.base.speed) + cmd->base.speed = ecmd.base.speed; + } + + return 0; +} + static netdev_features_t br_fix_features(struct net_device *dev, netdev_features_t features) { @@ -373,8 +403,9 @@ static int br_del_slave(struct net_device *dev, struct net_device *slave_dev) } static const struct ethtool_ops br_ethtool_ops = { - .get_drvinfo = br_getinfo, - .get_link = ethtool_op_get_link, + .get_drvinfo = br_getinfo, + .get_link = ethtool_op_get_link, + .get_link_ksettings = br_get_link_ksettings, }; static const struct net_device_ops br_netdev_ops = { diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index b1d3248c0252..4877a0db16c6 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -75,8 +75,9 @@ static inline unsigned long hold_time(const struct net_bridge *br) static inline int has_expired(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { - return !fdb->is_static && !fdb->added_by_external_learn && - time_before_eq(fdb->updated + hold_time(br), jiffies); + return !test_bit(BR_FDB_STATIC, &fdb->flags) && + !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags) && + time_before_eq(fdb->updated + hold_time(br), jiffies); } static void fdb_rcu_free(struct rcu_head *head) @@ -197,7 +198,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f, { trace_fdb_delete(br, f); - if (f->is_static) + if (test_bit(BR_FDB_STATIC, &f->flags)) fdb_del_hw_addr(br, f->key.addr.addr); hlist_del_init_rcu(&f->fdb_node); @@ -224,7 +225,7 @@ static void fdb_delete_local(struct net_bridge *br, if (op != p && ether_addr_equal(op->dev->dev_addr, addr) && (!vid || br_vlan_find(vg, vid))) { f->dst = op; - f->added_by_user = 0; + clear_bit(BR_FDB_ADDED_BY_USER, &f->flags); return; } } @@ -235,7 +236,7 @@ static void fdb_delete_local(struct net_bridge *br, if (p && ether_addr_equal(br->dev->dev_addr, addr) && (!vid || (v && br_vlan_should_use(v)))) { f->dst = NULL; - f->added_by_user = 0; + clear_bit(BR_FDB_ADDED_BY_USER, &f->flags); return; } @@ -250,7 +251,8 @@ void br_fdb_find_delete_local(struct net_bridge *br, spin_lock_bh(&br->hash_lock); f = br_fdb_find(br, addr, vid); - if (f && f->is_local && !f->added_by_user && f->dst == p) + if (f && test_bit(BR_FDB_LOCAL, &f->flags) && + !test_bit(BR_FDB_ADDED_BY_USER, &f->flags) && f->dst == p) fdb_delete_local(br, p, f); spin_unlock_bh(&br->hash_lock); } @@ -265,7 +267,8 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) spin_lock_bh(&br->hash_lock); vg = nbp_vlan_group(p); hlist_for_each_entry(f, &br->fdb_list, fdb_node) { - if (f->dst == p && f->is_local && !f->added_by_user) { + if (f->dst == p && test_bit(BR_FDB_LOCAL, &f->flags) && + !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) { /* delete old one */ fdb_delete_local(br, p, f); @@ -306,7 +309,8 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) /* If old entry was unassociated with any port, then delete it. */ f = br_fdb_find(br, br->dev->dev_addr, 0); - if (f && f->is_local && !f->dst && !f->added_by_user) + if (f && test_bit(BR_FDB_LOCAL, &f->flags) && + !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, 0); @@ -321,7 +325,8 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) if (!br_vlan_should_use(v)) continue; f = br_fdb_find(br, br->dev->dev_addr, v->vid); - if (f && f->is_local && !f->dst && !f->added_by_user) + if (f && test_bit(BR_FDB_LOCAL, &f->flags) && + !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, v->vid); } @@ -346,7 +351,8 @@ void br_fdb_cleanup(struct work_struct *work) hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { unsigned long this_timer; - if (f->is_static || f->added_by_external_learn) + if (test_bit(BR_FDB_STATIC, &f->flags) || + test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) continue; this_timer = f->updated + delay; if (time_after(this_timer, now)) { @@ -373,7 +379,7 @@ void br_fdb_flush(struct net_bridge *br) spin_lock_bh(&br->hash_lock); hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) { - if (!f->is_static) + if (!test_bit(BR_FDB_STATIC, &f->flags)) fdb_delete(br, f, true); } spin_unlock_bh(&br->hash_lock); @@ -397,10 +403,11 @@ void br_fdb_delete_by_port(struct net_bridge *br, continue; if (!do_all) - if (f->is_static || (vid && f->key.vlan_id != vid)) + if (test_bit(BR_FDB_STATIC, &f->flags) || + (vid && f->key.vlan_id != vid)) continue; - if (f->is_local) + if (test_bit(BR_FDB_LOCAL, &f->flags)) fdb_delete_local(br, p, f); else fdb_delete(br, f, true); @@ -469,8 +476,8 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, fe->port_no = f->dst->port_no; fe->port_hi = f->dst->port_no >> 8; - fe->is_local = f->is_local; - if (!f->is_static) + fe->is_local = test_bit(BR_FDB_LOCAL, &f->flags); + if (!test_bit(BR_FDB_STATIC, &f->flags)) fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated); ++fe; ++num; @@ -484,8 +491,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, __u16 vid, - unsigned char is_local, - unsigned char is_static) + unsigned long flags) { struct net_bridge_fdb_entry *fdb; @@ -494,12 +500,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, memcpy(fdb->key.addr.addr, addr, ETH_ALEN); fdb->dst = source; fdb->key.vlan_id = vid; - fdb->is_local = is_local; - fdb->is_static = is_static; - fdb->added_by_user = 0; - fdb->added_by_external_learn = 0; - fdb->offloaded = 0; - fdb->is_sticky = 0; + fdb->flags = flags; fdb->updated = fdb->used = jiffies; if (rhashtable_lookup_insert_fast(&br->fdb_hash_tbl, &fdb->rhnode, @@ -526,14 +527,15 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, /* it is okay to have multiple ports with same * address, just use the first one. */ - if (fdb->is_local) + if (test_bit(BR_FDB_LOCAL, &fdb->flags)) return 0; br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n", source ? source->dev->name : br->dev->name, addr, vid); fdb_delete(br, fdb, true); } - fdb = fdb_create(br, source, addr, vid, 1, 1); + fdb = fdb_create(br, source, addr, vid, + BIT(BR_FDB_LOCAL) | BIT(BR_FDB_STATIC)); if (!fdb) return -ENOMEM; @@ -555,7 +557,7 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, } void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid, bool added_by_user) + const unsigned char *addr, u16 vid, unsigned long flags) { struct net_bridge_fdb_entry *fdb; bool fdb_modified = false; @@ -564,15 +566,10 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, if (hold_time(br) == 0) return; - /* ignore packets unless we are using this port */ - if (!(source->state == BR_STATE_LEARNING || - source->state == BR_STATE_FORWARDING)) - return; - fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); if (likely(fdb)) { /* attempt to update an entry for a local interface */ - if (unlikely(fdb->is_local)) { + if (unlikely(test_bit(BR_FDB_LOCAL, &fdb->flags))) { if (net_ratelimit()) br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n", source->dev->name, addr, vid); @@ -580,30 +577,30 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, unsigned long now = jiffies; /* fastpath: update of existing entry */ - if (unlikely(source != fdb->dst && !fdb->is_sticky)) { + if (unlikely(source != fdb->dst && + !test_bit(BR_FDB_STICKY, &fdb->flags))) { fdb->dst = source; fdb_modified = true; /* Take over HW learned entry */ - if (unlikely(fdb->added_by_external_learn)) - fdb->added_by_external_learn = 0; + if (unlikely(test_bit(BR_FDB_ADDED_BY_EXT_LEARN, + &fdb->flags))) + clear_bit(BR_FDB_ADDED_BY_EXT_LEARN, + &fdb->flags); } if (now != fdb->updated) fdb->updated = now; - if (unlikely(added_by_user)) - fdb->added_by_user = 1; + if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags))) + set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (unlikely(fdb_modified)) { - trace_br_fdb_update(br, source, addr, vid, added_by_user); + trace_br_fdb_update(br, source, addr, vid, flags); fdb_notify(br, fdb, RTM_NEWNEIGH, true); } } } else { spin_lock(&br->hash_lock); - fdb = fdb_create(br, source, addr, vid, 0, 0); + fdb = fdb_create(br, source, addr, vid, flags); if (fdb) { - if (unlikely(added_by_user)) - fdb->added_by_user = 1; - trace_br_fdb_update(br, source, addr, vid, - added_by_user); + trace_br_fdb_update(br, source, addr, vid, flags); fdb_notify(br, fdb, RTM_NEWNEIGH, true); } /* else we lose race and someone else inserts @@ -616,9 +613,9 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, static int fdb_to_nud(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { - if (fdb->is_local) + if (test_bit(BR_FDB_LOCAL, &fdb->flags)) return NUD_PERMANENT; - else if (fdb->is_static) + else if (test_bit(BR_FDB_STATIC, &fdb->flags)) return NUD_NOARP; else if (has_expired(br, fdb)) return NUD_STALE; @@ -648,11 +645,11 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ndm->ndm_ifindex = fdb->dst ? fdb->dst->dev->ifindex : br->dev->ifindex; ndm->ndm_state = fdb_to_nud(br, fdb); - if (fdb->offloaded) + if (test_bit(BR_FDB_OFFLOADED, &fdb->flags)) ndm->ndm_flags |= NTF_OFFLOADED; - if (fdb->added_by_external_learn) + if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) ndm->ndm_flags |= NTF_EXT_LEARNED; - if (fdb->is_sticky) + if (test_bit(BR_FDB_STICKY, &fdb->flags)) ndm->ndm_flags |= NTF_STICKY; if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) @@ -799,7 +796,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, const u8 *addr, u16 state, u16 flags, u16 vid, u8 ndm_flags) { - u8 is_sticky = !!(ndm_flags & NTF_STICKY); + bool is_sticky = !!(ndm_flags & NTF_STICKY); struct net_bridge_fdb_entry *fdb; bool modified = false; @@ -823,7 +820,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (!(flags & NLM_F_CREATE)) return -ENOENT; - fdb = fdb_create(br, source, addr, vid, 0, 0); + fdb = fdb_create(br, source, addr, vid, 0); if (!fdb) return -ENOMEM; @@ -840,34 +837,28 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (fdb_to_nud(br, fdb) != state) { if (state & NUD_PERMANENT) { - fdb->is_local = 1; - if (!fdb->is_static) { - fdb->is_static = 1; + set_bit(BR_FDB_LOCAL, &fdb->flags); + if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags)) fdb_add_hw_addr(br, addr); - } } else if (state & NUD_NOARP) { - fdb->is_local = 0; - if (!fdb->is_static) { - fdb->is_static = 1; + clear_bit(BR_FDB_LOCAL, &fdb->flags); + if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags)) fdb_add_hw_addr(br, addr); - } } else { - fdb->is_local = 0; - if (fdb->is_static) { - fdb->is_static = 0; + clear_bit(BR_FDB_LOCAL, &fdb->flags); + if (test_and_clear_bit(BR_FDB_STATIC, &fdb->flags)) fdb_del_hw_addr(br, addr); - } } modified = true; } - if (is_sticky != fdb->is_sticky) { - fdb->is_sticky = is_sticky; + if (is_sticky != test_bit(BR_FDB_STICKY, &fdb->flags)) { + change_bit(BR_FDB_STICKY, &fdb->flags); modified = true; } - fdb->added_by_user = 1; + set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); fdb->used = jiffies; if (modified) { @@ -890,9 +881,12 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, br->dev->name); return -EINVAL; } + if (!nbp_state_should_learn(p)) + return 0; + local_bh_disable(); rcu_read_lock(); - br_fdb_update(br, p, addr, vid, true); + br_fdb_update(br, p, addr, vid, BIT(BR_FDB_ADDED_BY_USER)); rcu_read_unlock(); local_bh_enable(); } else if (ndm->ndm_flags & NTF_EXT_LEARNED) { @@ -1064,7 +1058,7 @@ int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p) rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { /* We only care for static entries */ - if (!f->is_static) + if (!test_bit(BR_FDB_STATIC, &f->flags)) continue; err = dev_uc_add(p->dev, f->key.addr.addr); if (err) @@ -1078,7 +1072,7 @@ done: rollback: hlist_for_each_entry_rcu(tmp, &br->fdb_list, fdb_node) { /* We only care for static entries */ - if (!tmp->is_static) + if (!test_bit(BR_FDB_STATIC, &tmp->flags)) continue; if (tmp == f) break; @@ -1097,7 +1091,7 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { /* We only care for static entries */ - if (!f->is_static) + if (!test_bit(BR_FDB_STATIC, &f->flags)) continue; dev_uc_del(p->dev, f->key.addr.addr); @@ -1119,14 +1113,15 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, fdb = br_fdb_find(br, addr, vid); if (!fdb) { - fdb = fdb_create(br, p, addr, vid, 0, 0); + unsigned long flags = BIT(BR_FDB_ADDED_BY_EXT_LEARN); + + if (swdev_notify) + flags |= BIT(BR_FDB_ADDED_BY_USER); + fdb = fdb_create(br, p, addr, vid, flags); if (!fdb) { err = -ENOMEM; goto err_unlock; } - if (swdev_notify) - fdb->added_by_user = 1; - fdb->added_by_external_learn = 1; fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); } else { fdb->updated = jiffies; @@ -1136,17 +1131,17 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, modified = true; } - if (fdb->added_by_external_learn) { + if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) { /* Refresh entry */ fdb->used = jiffies; - } else if (!fdb->added_by_user) { + } else if (!test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags)) { /* Take over SW learned entry */ - fdb->added_by_external_learn = 1; + set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); modified = true; } if (swdev_notify) - fdb->added_by_user = 1; + set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (modified) fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); @@ -1168,7 +1163,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); - if (fdb && fdb->added_by_external_learn) + if (fdb && test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) fdb_delete(br, fdb, swdev_notify); else err = -ENOENT; @@ -1186,8 +1181,8 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); - if (fdb) - fdb->offloaded = offloaded; + if (fdb && offloaded != test_bit(BR_FDB_OFFLOADED, &fdb->flags)) + change_bit(BR_FDB_OFFLOADED, &fdb->flags); spin_unlock_bh(&br->hash_lock); } @@ -1206,7 +1201,7 @@ void br_fdb_clear_offload(const struct net_device *dev, u16 vid) spin_lock_bh(&p->br->hash_lock); hlist_for_each_entry(f, &p->br->fdb_list, fdb_node) { if (f->dst == p && f->key.vlan_id == vid) - f->offloaded = 0; + clear_bit(BR_FDB_OFFLOADED, &f->flags); } spin_unlock_bh(&p->br->hash_lock); } diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 86637000f275..7629b63f6f30 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -25,7 +25,7 @@ static inline int should_deliver(const struct net_bridge_port *p, vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && - br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING && + p->state == BR_STATE_FORWARDING && br_allowed_egress(vg, skb) && nbp_switchdev_allowed_egress(p, skb) && !br_skb_isolated(p, skb); } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 09b1dd8cd853..fcc260840028 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -76,11 +76,14 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb bool local_rcv, mcast_hit = false; struct net_bridge *br; u16 vid = 0; + u8 state; if (!p || p->state == BR_STATE_DISABLED) goto drop; - if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid)) + state = p->state; + if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid, + &state)) goto out; nbp_switchdev_frame_mark(p, skb); @@ -88,7 +91,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb /* insert into forwarding database after filtering to avoid spoofing */ br = p->br; if (p->flags & BR_LEARNING) - br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false); + br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); local_rcv = !!(br->dev->flags & IFF_PROMISC); if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) { @@ -103,7 +106,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb } } - if (p->state == BR_STATE_LEARNING) + if (state == BR_STATE_LEARNING) goto drop; BR_INPUT_SKB_CB(skb)->brdev = br->dev; @@ -151,7 +154,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (dst) { unsigned long now = jiffies; - if (dst->is_local) + if (test_bit(BR_FDB_LOCAL, &dst->flags)) return br_pass_frame_up(skb); if (now != dst->used) @@ -182,9 +185,10 @@ static void __br_handle_local_finish(struct sk_buff *skb) /* check if vlan is allowed, to avoid spoofing */ if ((p->flags & BR_LEARNING) && + nbp_state_should_learn(p) && !br_opt_get(p->br, BROPT_NO_LL_LEARN) && br_should_learn(p, skb, &vid)) - br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false); + br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, 0); } /* note: already called with rcu_read_lock */ diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index bf6acd34234d..da5ed4cf9233 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -60,6 +60,8 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) e->flags = 0; if (flags & MDB_PG_FLAGS_OFFLOAD) e->flags |= MDB_FLAGS_OFFLOAD; + if (flags & MDB_PG_FLAGS_FAST_LEAVE) + e->flags |= MDB_FLAGS_FAST_LEAVE; } static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip) @@ -75,6 +77,53 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip) #endif } +static int __mdb_fill_info(struct sk_buff *skb, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *p) +{ + struct timer_list *mtimer; + struct nlattr *nest_ent; + struct br_mdb_entry e; + u8 flags = 0; + int ifindex; + + memset(&e, 0, sizeof(e)); + if (p) { + ifindex = p->port->dev->ifindex; + mtimer = &p->timer; + flags = p->flags; + } else { + ifindex = mp->br->dev->ifindex; + mtimer = &mp->timer; + } + + __mdb_entry_fill_flags(&e, flags); + e.ifindex = ifindex; + e.vid = mp->addr.vid; + if (mp->addr.proto == htons(ETH_P_IP)) + e.addr.u.ip4 = mp->addr.u.ip4; +#if IS_ENABLED(CONFIG_IPV6) + if (mp->addr.proto == htons(ETH_P_IPV6)) + e.addr.u.ip6 = mp->addr.u.ip6; +#endif + e.addr.proto = mp->addr.proto; + nest_ent = nla_nest_start_noflag(skb, + MDBA_MDB_ENTRY_INFO); + if (!nest_ent) + return -EMSGSIZE; + + if (nla_put_nohdr(skb, sizeof(e), &e) || + nla_put_u32(skb, + MDBA_MDB_EATTR_TIMER, + br_timer_value(mtimer))) { + nla_nest_cancel(skb, nest_ent); + return -EMSGSIZE; + } + nla_nest_end(skb, nest_ent); + + return 0; +} + static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { @@ -93,7 +142,6 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; - struct net_bridge_port *port; if (idx < s_idx) goto skip; @@ -104,43 +152,24 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, break; } + if (mp->host_joined) { + err = __mdb_fill_info(skb, mp, NULL); + if (err) { + nla_nest_cancel(skb, nest2); + break; + } + } + for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; pp = &p->next) { - struct nlattr *nest_ent; - struct br_mdb_entry e; - - port = p->port; - if (!port) + if (!p->port) continue; - memset(&e, 0, sizeof(e)); - e.ifindex = port->dev->ifindex; - e.vid = p->addr.vid; - __mdb_entry_fill_flags(&e, p->flags); - if (p->addr.proto == htons(ETH_P_IP)) - e.addr.u.ip4 = p->addr.u.ip4; -#if IS_ENABLED(CONFIG_IPV6) - if (p->addr.proto == htons(ETH_P_IPV6)) - e.addr.u.ip6 = p->addr.u.ip6; -#endif - e.addr.proto = p->addr.proto; - nest_ent = nla_nest_start_noflag(skb, - MDBA_MDB_ENTRY_INFO); - if (!nest_ent) { + err = __mdb_fill_info(skb, mp, p); + if (err) { nla_nest_cancel(skb, nest2); - err = -EMSGSIZE; goto out; } - if (nla_put_nohdr(skb, sizeof(e), &e) || - nla_put_u32(skb, - MDBA_MDB_EATTR_TIMER, - br_timer_value(&p->timer))) { - nla_nest_cancel(skb, nest_ent); - nla_nest_cancel(skb, nest2); - err = -EMSGSIZE; - goto out; - } - nla_nest_end(skb, nest_ent); } nla_nest_end(skb, nest2); skip: @@ -437,7 +466,7 @@ static int nlmsg_populate_rtr_fill(struct sk_buff *skb, struct nlmsghdr *nlh; struct nlattr *nest; - nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), NLM_F_MULTI); + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; @@ -587,6 +616,19 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, return err; } + /* host join */ + if (!port) { + /* don't allow any flags for host-joined groups */ + if (state) + return -EINVAL; + if (mp->host_joined) + return -EEXIST; + + br_multicast_host_join(mp, false); + + return 0; + } + for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { @@ -611,19 +653,21 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, { struct br_ip ip; struct net_device *dev; - struct net_bridge_port *p; + struct net_bridge_port *p = NULL; int ret; if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED)) return -EINVAL; - dev = __dev_get_by_index(net, entry->ifindex); - if (!dev) - return -ENODEV; + if (entry->ifindex != br->dev->ifindex) { + dev = __dev_get_by_index(net, entry->ifindex); + if (!dev) + return -ENODEV; - p = br_port_get_rtnl(dev); - if (!p || p->br != br || p->state == BR_STATE_DISABLED) - return -EINVAL; + p = br_port_get_rtnl(dev); + if (!p || p->br != br || p->state == BR_STATE_DISABLED) + return -EINVAL; + } __mdb_entry_to_br_ip(entry, &ip); @@ -638,9 +682,9 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct net_bridge_vlan_group *vg; + struct net_bridge_port *p = NULL; struct net_device *dev, *pdev; struct br_mdb_entry *entry; - struct net_bridge_port *p; struct net_bridge_vlan *v; struct net_bridge *br; int err; @@ -651,18 +695,22 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, br = netdev_priv(dev); + if (entry->ifindex != br->dev->ifindex) { + pdev = __dev_get_by_index(net, entry->ifindex); + if (!pdev) + return -ENODEV; + + p = br_port_get_rtnl(pdev); + if (!p || p->br != br || p->state == BR_STATE_DISABLED) + return -EINVAL; + vg = nbp_vlan_group(p); + } else { + vg = br_vlan_group(br); + } + /* If vlan filtering is enabled and VLAN is not specified * install mdb entry on all vlans configured on the port. */ - pdev = __dev_get_by_index(net, entry->ifindex); - if (!pdev) - return -ENODEV; - - p = br_port_get_rtnl(pdev); - if (!p || p->br != br || p->state == BR_STATE_DISABLED) - return -EINVAL; - - vg = nbp_vlan_group(p); if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { entry->vid = v->vid; @@ -698,6 +746,15 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (!mp) goto unlock; + /* host leave */ + if (entry->ifindex == mp->br->dev->ifindex && mp->host_joined) { + br_multicast_host_leave(mp, false); + err = 0; + if (!mp->ports && netif_running(br->dev)) + mod_timer(&mp->timer, jiffies); + goto unlock; + } + for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { @@ -730,9 +787,9 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct net_bridge_vlan_group *vg; + struct net_bridge_port *p = NULL; struct net_device *dev, *pdev; struct br_mdb_entry *entry; - struct net_bridge_port *p; struct net_bridge_vlan *v; struct net_bridge *br; int err; @@ -743,18 +800,22 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, br = netdev_priv(dev); + if (entry->ifindex != br->dev->ifindex) { + pdev = __dev_get_by_index(net, entry->ifindex); + if (!pdev) + return -ENODEV; + + p = br_port_get_rtnl(pdev); + if (!p || p->br != br || p->state == BR_STATE_DISABLED) + return -EINVAL; + vg = nbp_vlan_group(p); + } else { + vg = br_vlan_group(br); + } + /* If vlan filtering is enabled and VLAN is not specified * delete mdb entry on all vlans configured on the port. */ - pdev = __dev_get_by_index(net, entry->ifindex); - if (!pdev) - return -ENODEV; - - p = br_port_get_rtnl(pdev); - if (!p || p->br != br || p->state == BR_STATE_DISABLED) - return -EINVAL; - - vg = nbp_vlan_group(p); if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { entry->vid = v->vid; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index f8cac3702712..ad12fe3fca8c 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -148,8 +148,7 @@ static void br_multicast_group_expired(struct timer_list *t) if (!netif_running(br->dev) || timer_pending(&mp->timer)) goto out; - mp->host_joined = false; - br_mdb_notify(br->dev, NULL, &mp->addr, RTM_DELMDB, 0); + br_multicast_host_leave(mp, true); if (mp->ports) goto out; @@ -512,6 +511,27 @@ static bool br_port_group_equal(struct net_bridge_port_group *p, return ether_addr_equal(src, p->eth_addr); } +void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify) +{ + if (!mp->host_joined) { + mp->host_joined = true; + if (notify) + br_mdb_notify(mp->br->dev, NULL, &mp->addr, + RTM_NEWMDB, 0); + } + mod_timer(&mp->timer, jiffies + mp->br->multicast_membership_interval); +} + +void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify) +{ + if (!mp->host_joined) + return; + + mp->host_joined = false; + if (notify) + br_mdb_notify(mp->br->dev, NULL, &mp->addr, RTM_DELMDB, 0); +} + static int br_multicast_add_group(struct net_bridge *br, struct net_bridge_port *port, struct br_ip *group, @@ -534,11 +554,7 @@ static int br_multicast_add_group(struct net_bridge *br, goto err; if (!port) { - if (!mp->host_joined) { - mp->host_joined = true; - br_mdb_notify(br->dev, NULL, &mp->addr, RTM_NEWMDB, 0); - } - mod_timer(&mp->timer, now + br->multicast_membership_interval); + br_multicast_host_join(mp, true); goto out; } @@ -1396,7 +1412,7 @@ br_multicast_leave_group(struct net_bridge *br, del_timer(&p->timer); kfree_rcu(p, rcu); br_mdb_notify(br->dev, port, group, RTM_DELMDB, - p->flags); + p->flags | MDB_PG_FLAGS_FAST_LEAVE); if (!mp->ports && !mp->host_joined && netif_running(br->dev)) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index d3f9592f4ff8..59980ecfc962 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -496,6 +496,10 @@ static unsigned int br_nf_pre_routing(void *priv, if (!brnet->call_ip6tables && !br_opt_get(br, BROPT_NF_CALL_IP6TABLES)) return NF_ACCEPT; + if (!ipv6_mod_enabled()) { + pr_warn_once("Module ipv6 is disabled, so call_ip6tables is not supported."); + return NF_DROP; + } nf_bridge_pull_encap_header_rcsum(skb); return br_nf_pre_routing_ipv6(priv, skb, state); @@ -658,6 +662,9 @@ static unsigned int br_nf_forward_arp(void *priv, nf_bridge_pull_encap_header(skb); } + if (unlikely(!pskb_may_pull(skb, sizeof(struct arphdr)))) + return NF_DROP; + if (arp_hdr(skb)->ar_pln != 4) { if (is_vlan_arp(skb, state->net)) nf_bridge_push_encap_header(skb); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index a0a54482aabc..43dab4066f91 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -561,52 +561,73 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, return err; } -static int br_process_vlan_info(struct net_bridge *br, - struct net_bridge_port *p, int cmd, - struct bridge_vlan_info *vinfo_curr, - struct bridge_vlan_info **vinfo_last, - bool *changed, - struct netlink_ext_ack *extack) +int br_process_vlan_info(struct net_bridge *br, + struct net_bridge_port *p, int cmd, + struct bridge_vlan_info *vinfo_curr, + struct bridge_vlan_info **vinfo_last, + bool *changed, + struct netlink_ext_ack *extack) { - if (!vinfo_curr->vid || vinfo_curr->vid >= VLAN_VID_MASK) + int err, rtm_cmd; + + if (!br_vlan_valid_id(vinfo_curr->vid, extack)) return -EINVAL; + /* needed for vlan-only NEWVLAN/DELVLAN notifications */ + rtm_cmd = br_afspec_cmd_to_rtm(cmd); + if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { - /* check if we are already processing a range */ - if (*vinfo_last) + if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack)) return -EINVAL; *vinfo_last = vinfo_curr; - /* don't allow range of pvids */ - if ((*vinfo_last)->flags & BRIDGE_VLAN_INFO_PVID) - return -EINVAL; return 0; } if (*vinfo_last) { struct bridge_vlan_info tmp_vinfo; - int v, err; - - if (!(vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END)) - return -EINVAL; + int v, v_change_start = 0; - if (vinfo_curr->vid <= (*vinfo_last)->vid) + if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack)) return -EINVAL; memcpy(&tmp_vinfo, *vinfo_last, sizeof(struct bridge_vlan_info)); for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) { + bool curr_change = false; + tmp_vinfo.vid = v; - err = br_vlan_info(br, p, cmd, &tmp_vinfo, changed, + err = br_vlan_info(br, p, cmd, &tmp_vinfo, &curr_change, extack); if (err) break; + if (curr_change) { + *changed = curr_change; + if (!v_change_start) + v_change_start = v; + } else { + /* nothing to notify yet */ + if (!v_change_start) + continue; + br_vlan_notify(br, p, v_change_start, + v - 1, rtm_cmd); + v_change_start = 0; + } } + /* v_change_start is set only if the last/whole range changed */ + if (v_change_start) + br_vlan_notify(br, p, v_change_start, + v - 1, rtm_cmd); + *vinfo_last = NULL; return err; } - return br_vlan_info(br, p, cmd, vinfo_curr, changed, extack); + err = br_vlan_info(br, p, cmd, vinfo_curr, changed, extack); + if (*changed) + br_vlan_notify(br, p, vinfo_curr->vid, 0, rtm_cmd); + + return err; } static int br_afspec(struct net_bridge *br, @@ -1607,6 +1628,19 @@ static int br_fill_linkxstats(struct sk_buff *skb, br_multicast_get_stats(br, p, nla_data(nla)); } #endif + + if (p) { + nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_STP, + sizeof(p->stp_xstats), + BRIDGE_XSTATS_PAD); + if (!nla) + goto nla_put_failure; + + spin_lock_bh(&br->lock); + memcpy(nla_data(nla), &p->stp_xstats, sizeof(p->stp_xstats)); + spin_unlock_bh(&br->lock); + } + nla_nest_end(skb, nest); *prividx = 0; @@ -1651,6 +1685,7 @@ int __init br_netlink_init(void) int err; br_mdb_init(); + br_vlan_rtnl_init(); rtnl_af_register(&br_af_ops); err = rtnl_link_register(&br_link_ops); @@ -1668,6 +1703,7 @@ out_af: void br_netlink_fini(void) { br_mdb_uninit(); + br_vlan_rtnl_uninit(); rtnl_af_unregister(&br_af_ops); rtnl_link_unregister(&br_link_ops); } diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c index 2cdfc5d6c25d..8c69f0c95a8e 100644 --- a/net/bridge/br_nf_core.c +++ b/net/bridge/br_nf_core.c @@ -22,7 +22,8 @@ #endif static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 646504db0220..5153ffe79a01 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -113,6 +113,7 @@ enum { * @vid: VLAN id * @flags: bridge vlan flags * @priv_flags: private (in-kernel) bridge vlan flags + * @state: STP state (e.g. blocking, learning, forwarding) * @stats: per-cpu VLAN statistics * @br: if MASTER flag set, this points to a bridge struct * @port: if MASTER flag unset, this points to a port struct @@ -133,6 +134,7 @@ struct net_bridge_vlan { u16 vid; u16 flags; u16 priv_flags; + u8 state; struct br_vlan_stats __percpu *stats; union { struct net_bridge *br; @@ -157,6 +159,7 @@ struct net_bridge_vlan { * @vlan_list: sorted VLAN entry list * @num_vlans: number of total VLAN entries * @pvid: PVID VLAN id + * @pvid_state: PVID's STP state (e.g. forwarding, learning, blocking) * * IMPORTANT: Be careful when checking if there're VLAN entries using list * primitives because the bridge can have entries in its list which @@ -170,6 +173,17 @@ struct net_bridge_vlan_group { struct list_head vlan_list; u16 num_vlans; u16 pvid; + u8 pvid_state; +}; + +/* bridge fdb flags */ +enum { + BR_FDB_LOCAL, + BR_FDB_STATIC, + BR_FDB_STICKY, + BR_FDB_ADDED_BY_USER, + BR_FDB_ADDED_BY_EXT_LEARN, + BR_FDB_OFFLOADED, }; struct net_bridge_fdb_key { @@ -183,12 +197,7 @@ struct net_bridge_fdb_entry { struct net_bridge_fdb_key key; struct hlist_node fdb_node; - unsigned char is_local:1, - is_static:1, - is_sticky:1, - added_by_user:1, - added_by_external_learn:1, - offloaded:1; + unsigned long flags; /* write-heavy members should not affect lookups */ unsigned long updated ____cacheline_aligned_in_smp; @@ -199,6 +208,7 @@ struct net_bridge_fdb_entry { #define MDB_PG_FLAGS_PERMANENT BIT(0) #define MDB_PG_FLAGS_OFFLOAD BIT(1) +#define MDB_PG_FLAGS_FAST_LEAVE BIT(2) struct net_bridge_port_group { struct net_bridge_port *port; @@ -277,6 +287,8 @@ struct net_bridge_port { #endif u16 group_fwd_mask; u16 backup_redirected_cnt; + + struct bridge_stp_xstats stp_xstats; }; #define kobj_to_brport(obj) container_of(obj, struct net_bridge_port, kobj) @@ -494,6 +506,70 @@ static inline bool br_vlan_should_use(const struct net_bridge_vlan *v) return true; } +static inline bool nbp_state_should_learn(const struct net_bridge_port *p) +{ + return p->state == BR_STATE_LEARNING || p->state == BR_STATE_FORWARDING; +} + +static inline bool br_vlan_valid_id(u16 vid, struct netlink_ext_ack *extack) +{ + bool ret = vid > 0 && vid < VLAN_VID_MASK; + + if (!ret) + NL_SET_ERR_MSG_MOD(extack, "Vlan id is invalid"); + + return ret; +} + +static inline bool br_vlan_valid_range(const struct bridge_vlan_info *cur, + const struct bridge_vlan_info *last, + struct netlink_ext_ack *extack) +{ + /* pvid flag is not allowed in ranges */ + if (cur->flags & BRIDGE_VLAN_INFO_PVID) { + NL_SET_ERR_MSG_MOD(extack, "Pvid isn't allowed in a range"); + return false; + } + + /* when cur is the range end, check if: + * - it has range start flag + * - range ids are invalid (end is equal to or before start) + */ + if (last) { + if (cur->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { + NL_SET_ERR_MSG_MOD(extack, "Found a new vlan range start while processing one"); + return false; + } else if (!(cur->flags & BRIDGE_VLAN_INFO_RANGE_END)) { + NL_SET_ERR_MSG_MOD(extack, "Vlan range end flag is missing"); + return false; + } else if (cur->vid <= last->vid) { + NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id"); + return false; + } + } + + /* check for required range flags */ + if (!(cur->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | + BRIDGE_VLAN_INFO_RANGE_END))) { + NL_SET_ERR_MSG_MOD(extack, "Both vlan range flags are missing"); + return false; + } + + return true; +} + +static inline int br_afspec_cmd_to_rtm(int cmd) +{ + switch (cmd) { + case RTM_SETLINK: + return RTM_NEWVLAN; + case RTM_DELLINK: + return RTM_DELVLAN; + } + + return 0; +} + static inline int br_opt_get(const struct net_bridge *br, enum net_bridge_opts opt) { @@ -565,7 +641,7 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count, int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid); void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid, bool added_by_user); + const unsigned char *addr, u16 vid, unsigned long flags); int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid); @@ -701,6 +777,8 @@ void br_multicast_get_stats(const struct net_bridge *br, struct br_mcast_stats *dest); void br_mdb_init(void); void br_mdb_uninit(void); +void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify); +void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify); #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) @@ -861,7 +939,7 @@ static inline int br_multicast_igmp_type(const struct sk_buff *skb) #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid); + u16 *vid, u8 *state); bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb); bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid); @@ -896,6 +974,14 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, void br_vlan_port_event(struct net_bridge_port *p, unsigned long event); int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr); +void br_vlan_rtnl_init(void); +void br_vlan_rtnl_uninit(void); +void br_vlan_notify(const struct net_bridge *br, + const struct net_bridge_port *p, + u16 vid, u16 vid_range, + int cmd); +bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *range_end); static inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) @@ -947,11 +1033,15 @@ static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg) return vg->pvid; } +static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid) +{ + return v->vid == pvid ? v->flags | BRIDGE_VLAN_INFO_PVID : v->flags; +} #else static inline bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid) + u16 *vid, u8 *state) { return true; } @@ -1090,6 +1180,70 @@ static inline int br_vlan_bridge_event(struct net_device *dev, { return 0; } + +static inline void br_vlan_rtnl_init(void) +{ +} + +static inline void br_vlan_rtnl_uninit(void) +{ +} + +static inline void br_vlan_notify(const struct net_bridge *br, + const struct net_bridge_port *p, + u16 vid, u16 vid_range, + int cmd) +{ +} +#endif + +/* br_vlan_options.c */ +#ifdef CONFIG_BRIDGE_VLAN_FILTERING +bool br_vlan_opts_eq(const struct net_bridge_vlan *v1, + const struct net_bridge_vlan *v2); +bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v); +size_t br_vlan_opts_nl_size(void); +int br_vlan_process_options(const struct net_bridge *br, + const struct net_bridge_port *p, + struct net_bridge_vlan *range_start, + struct net_bridge_vlan *range_end, + struct nlattr **tb, + struct netlink_ext_ack *extack); + +/* vlan state manipulation helpers using *_ONCE to annotate lock-free access */ +static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) +{ + return READ_ONCE(v->state); +} + +static inline void br_vlan_set_state(struct net_bridge_vlan *v, u8 state) +{ + WRITE_ONCE(v->state, state); +} + +static inline u8 br_vlan_get_pvid_state(const struct net_bridge_vlan_group *vg) +{ + return READ_ONCE(vg->pvid_state); +} + +static inline void br_vlan_set_pvid_state(struct net_bridge_vlan_group *vg, + u8 state) +{ + WRITE_ONCE(vg->pvid_state, state); +} + +/* learn_allow is true at ingress and false at egress */ +static inline bool br_vlan_state_allowed(u8 state, bool learn_allow) +{ + switch (state) { + case BR_STATE_LEARNING: + return learn_allow; + case BR_STATE_FORWARDING: + return true; + default: + return false; + } +} #endif struct nf_br_ops { @@ -1161,6 +1315,12 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags, int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask, int nlflags); +int br_process_vlan_info(struct net_bridge *br, + struct net_bridge_port *p, int cmd, + struct bridge_vlan_info *vinfo_curr, + struct bridge_vlan_info **vinfo_last, + bool *changed, + struct netlink_ext_ack *extack); #ifdef CONFIG_SYSFS /* br_sysfs_if.c */ diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 1f1410f8d312..6856a6d9282b 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -45,6 +45,17 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) br_info(p->br, "port %u(%s) entered %s state\n", (unsigned int) p->port_no, p->dev->name, br_port_state_names[p->state]); + + if (p->br->stp_enabled == BR_KERNEL_STP) { + switch (p->state) { + case BR_STATE_BLOCKING: + p->stp_xstats.transition_blk++; + break; + case BR_STATE_FORWARDING: + p->stp_xstats.transition_fwd++; + break; + } + } } /* called under bridge lock */ @@ -484,6 +495,8 @@ void br_received_config_bpdu(struct net_bridge_port *p, struct net_bridge *br; int was_root; + p->stp_xstats.rx_bpdu++; + br = p->br; was_root = br_is_root_bridge(br); @@ -517,6 +530,8 @@ void br_received_config_bpdu(struct net_bridge_port *p, /* called under bridge lock */ void br_received_tcn_bpdu(struct net_bridge_port *p) { + p->stp_xstats.rx_tcn++; + if (br_is_designated_port(p)) { br_info(p->br, "port %u(%s) received tcn bpdu\n", (unsigned int) p->port_no, p->dev->name); diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c index 7796dd9d42d7..0e4572f31330 100644 --- a/net/bridge/br_stp_bpdu.c +++ b/net/bridge/br_stp_bpdu.c @@ -118,6 +118,8 @@ void br_send_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu) br_set_ticks(buf+33, bpdu->forward_delay); br_send_bpdu(p, buf, 35); + + p->stp_xstats.tx_bpdu++; } /* called under bridge lock */ @@ -133,6 +135,8 @@ void br_send_tcn_bpdu(struct net_bridge_port *p) buf[2] = 0; buf[3] = BPDU_TYPE_TCN; br_send_bpdu(p, buf, 4); + + p->stp_xstats.tx_tcn++; } /* diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 921310d3cbae..015209bf44aa 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -129,15 +129,19 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr, fdb->key.vlan_id, fdb->dst->dev, - fdb->added_by_user, - fdb->offloaded); + test_bit(BR_FDB_ADDED_BY_USER, + &fdb->flags), + test_bit(BR_FDB_OFFLOADED, + &fdb->flags)); break; case RTM_NEWNEIGH: br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr, fdb->key.vlan_id, fdb->dst->dev, - fdb->added_by_user, - fdb->offloaded); + test_bit(BR_FDB_ADDED_BY_USER, + &fdb->flags), + test_bit(BR_FDB_OFFLOADED, + &fdb->flags)); break; } } diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index f5b2aeebbfe9..6b5deca08b89 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -34,13 +34,15 @@ static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid) return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params); } -static bool __vlan_add_pvid(struct net_bridge_vlan_group *vg, u16 vid) +static bool __vlan_add_pvid(struct net_bridge_vlan_group *vg, + const struct net_bridge_vlan *v) { - if (vg->pvid == vid) + if (vg->pvid == v->vid) return false; smp_wmb(); - vg->pvid = vid; + br_vlan_set_pvid_state(vg, v->state); + vg->pvid = v->vid; return true; } @@ -69,7 +71,7 @@ static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags) vg = nbp_vlan_group(v->port); if (flags & BRIDGE_VLAN_INFO_PVID) - ret = __vlan_add_pvid(vg, v->vid); + ret = __vlan_add_pvid(vg, v); else ret = __vlan_delete_pvid(vg, v->vid); @@ -257,6 +259,10 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, &changed, extack); if (err) goto out_filt; + + if (changed) + br_vlan_notify(br, NULL, v->vid, 0, + RTM_NEWVLAN); } masterv = br_vlan_get_master(br, v->vid, extack); @@ -289,6 +295,9 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, vg->num_vlans++; } + /* set the state before publishing */ + v->state = BR_STATE_FORWARDING; + err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); if (err) @@ -380,13 +389,31 @@ static void __vlan_group_free(struct net_bridge_vlan_group *vg) kfree(vg); } -static void __vlan_flush(struct net_bridge_vlan_group *vg) +static void __vlan_flush(const struct net_bridge *br, + const struct net_bridge_port *p, + struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *vlan, *tmp; + u16 v_start = 0, v_end = 0; __vlan_delete_pvid(vg, vg->pvid); - list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) + list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) { + /* take care of disjoint ranges */ + if (!v_start) { + v_start = vlan->vid; + } else if (vlan->vid - v_end != 1) { + /* found range end, notify and start next one */ + br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); + v_start = vlan->vid; + } + v_end = vlan->vid; + __vlan_del(vlan); + } + + /* notify about the last/whole vlan range */ + if (v_start) + br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); } struct sk_buff *br_handle_vlan(struct net_bridge *br, @@ -444,7 +471,8 @@ out: /* Called under RCU */ static bool __allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, - struct sk_buff *skb, u16 *vid) + struct sk_buff *skb, u16 *vid, + u8 *state) { struct br_vlan_stats *stats; struct net_bridge_vlan *v; @@ -510,13 +538,25 @@ static bool __allowed_ingress(const struct net_bridge *br, skb->vlan_tci |= pvid; /* if stats are disabled we can avoid the lookup */ - if (!br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) - return true; + if (!br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { + if (*state == BR_STATE_FORWARDING) { + *state = br_vlan_get_pvid_state(vg); + return br_vlan_state_allowed(*state, true); + } else { + return true; + } + } } v = br_vlan_find(vg, *vid); if (!v || !br_vlan_should_use(v)) goto drop; + if (*state == BR_STATE_FORWARDING) { + *state = br_vlan_get_state(v); + if (!br_vlan_state_allowed(*state, true)) + goto drop; + } + if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { stats = this_cpu_ptr(v->stats); u64_stats_update_begin(&stats->syncp); @@ -534,7 +574,7 @@ drop: bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid) + u16 *vid, u8 *state) { /* If VLAN filtering is disabled on the bridge, all packets are * permitted. @@ -544,7 +584,7 @@ bool br_allowed_ingress(const struct net_bridge *br, return true; } - return __allowed_ingress(br, vg, skb, vid); + return __allowed_ingress(br, vg, skb, vid, state); } /* Called under RCU. */ @@ -560,7 +600,8 @@ bool br_allowed_egress(struct net_bridge_vlan_group *vg, br_vlan_get_tag(skb, &vid); v = br_vlan_find(vg, vid); - if (v && br_vlan_should_use(v)) + if (v && br_vlan_should_use(v) && + br_vlan_state_allowed(br_vlan_get_state(v), false)) return true; return false; @@ -571,6 +612,7 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) { struct net_bridge_vlan_group *vg; struct net_bridge *br = p->br; + struct net_bridge_vlan *v; /* If filtering was disabled at input, let it pass. */ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) @@ -585,13 +627,15 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) if (!*vid) { *vid = br_get_pvid(vg); - if (!*vid) + if (!*vid || + !br_vlan_state_allowed(br_vlan_get_pvid_state(vg), true)) return false; return true; } - if (br_vlan_find(vg, *vid)) + v = br_vlan_find(vg, *vid); + if (v && br_vlan_state_allowed(br_vlan_get_state(v), true)) return true; return false; @@ -716,7 +760,7 @@ void br_vlan_flush(struct net_bridge *br) ASSERT_RTNL(); vg = br_vlan_group(br); - __vlan_flush(vg); + __vlan_flush(br, NULL, vg); RCU_INIT_POINTER(br->vlgrp, NULL); synchronize_rcu(); __vlan_group_free(vg); @@ -925,12 +969,15 @@ static void br_vlan_disable_default_pvid(struct net_bridge *br) /* Disable default_pvid on all ports where it is still * configured. */ - if (vlan_default_pvid(br_vlan_group(br), pvid)) - br_vlan_delete(br, pvid); + if (vlan_default_pvid(br_vlan_group(br), pvid)) { + if (!br_vlan_delete(br, pvid)) + br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); + } list_for_each_entry(p, &br->port_list, list) { - if (vlan_default_pvid(nbp_vlan_group(p), pvid)) - nbp_vlan_delete(p, pvid); + if (vlan_default_pvid(nbp_vlan_group(p), pvid) && + !nbp_vlan_delete(p, pvid)) + br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); } br->default_pvid = 0; @@ -972,7 +1019,10 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, &vlchange, extack); if (err) goto out; - br_vlan_delete(br, old_pvid); + + if (br_vlan_delete(br, old_pvid)) + br_vlan_notify(br, NULL, old_pvid, 0, RTM_DELVLAN); + br_vlan_notify(br, NULL, pvid, 0, RTM_NEWVLAN); set_bit(0, changed); } @@ -992,7 +1042,9 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, &vlchange, extack); if (err) goto err_port; - nbp_vlan_delete(p, old_pvid); + if (nbp_vlan_delete(p, old_pvid)) + br_vlan_notify(br, p, old_pvid, 0, RTM_DELVLAN); + br_vlan_notify(p->br, p, pvid, 0, RTM_NEWVLAN); set_bit(p->port_no, changed); } @@ -1007,22 +1059,28 @@ err_port: if (!test_bit(p->port_no, changed)) continue; - if (old_pvid) + if (old_pvid) { nbp_vlan_add(p, old_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, &vlchange, NULL); + br_vlan_notify(p->br, p, old_pvid, 0, RTM_NEWVLAN); + } nbp_vlan_delete(p, pvid); + br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); } if (test_bit(0, changed)) { - if (old_pvid) + if (old_pvid) { br_vlan_add(br, old_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, &vlchange, NULL); + br_vlan_notify(br, NULL, old_pvid, 0, RTM_NEWVLAN); + } br_vlan_delete(br, pvid); + br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); } goto out; } @@ -1115,6 +1173,7 @@ int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack) &changed, extack); if (ret) goto err_vlan_add; + br_vlan_notify(p->br, p, p->br->default_pvid, 0, RTM_NEWVLAN); } out: return ret; @@ -1196,7 +1255,7 @@ void nbp_vlan_flush(struct net_bridge_port *port) ASSERT_RTNL(); vg = nbp_vlan_group(port); - __vlan_flush(vg); + __vlan_flush(port->br, port, vg); RCU_INIT_POINTER(port->vlgrp, NULL); synchronize_rcu(); __vlan_group_free(vg); @@ -1281,6 +1340,8 @@ int br_vlan_get_info(const struct net_device *dev, u16 vid, p_vinfo->vid = vid; p_vinfo->flags = v->flags; + if (vid == br_get_pvid(vg)) + p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_info); @@ -1460,8 +1521,8 @@ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) { struct netdev_notifier_changeupper_info *info; struct net_bridge *br = netdev_priv(dev); - bool changed; - int ret = 0; + int vlcmd = 0, ret = 0; + bool changed = false; switch (event) { case NETDEV_REGISTER: @@ -1469,9 +1530,11 @@ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, &changed, NULL); + vlcmd = RTM_NEWVLAN; break; case NETDEV_UNREGISTER: - br_vlan_delete(br, br->default_pvid); + changed = !br_vlan_delete(br, br->default_pvid); + vlcmd = RTM_DELVLAN; break; case NETDEV_CHANGEUPPER: info = ptr; @@ -1485,6 +1548,8 @@ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) br_vlan_link_state_change(dev, br); break; } + if (changed) + br_vlan_notify(br, NULL, br->default_pvid, 0, vlcmd); return ret; } @@ -1503,3 +1568,441 @@ void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) break; } } + +/* v_opts is used to dump the options which must be equal in the whole range */ +static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, + const struct net_bridge_vlan *v_opts, + u16 flags) +{ + struct bridge_vlan_info info; + struct nlattr *nest; + + nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY); + if (!nest) + return false; + + memset(&info, 0, sizeof(info)); + info.vid = vid; + if (flags & BRIDGE_VLAN_INFO_UNTAGGED) + info.flags |= BRIDGE_VLAN_INFO_UNTAGGED; + if (flags & BRIDGE_VLAN_INFO_PVID) + info.flags |= BRIDGE_VLAN_INFO_PVID; + + if (nla_put(skb, BRIDGE_VLANDB_ENTRY_INFO, sizeof(info), &info)) + goto out_err; + + if (vid_range && vid < vid_range && + !(flags & BRIDGE_VLAN_INFO_PVID) && + nla_put_u16(skb, BRIDGE_VLANDB_ENTRY_RANGE, vid_range)) + goto out_err; + + if (v_opts && !br_vlan_opts_fill(skb, v_opts)) + goto out_err; + + nla_nest_end(skb, nest); + + return true; + +out_err: + nla_nest_cancel(skb, nest); + return false; +} + +static size_t rtnl_vlan_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct br_vlan_msg)) + + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY */ + + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_ENTRY_RANGE */ + + nla_total_size(sizeof(struct bridge_vlan_info)) /* BRIDGE_VLANDB_ENTRY_INFO */ + + br_vlan_opts_nl_size(); /* bridge vlan options */ +} + +void br_vlan_notify(const struct net_bridge *br, + const struct net_bridge_port *p, + u16 vid, u16 vid_range, + int cmd) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v = NULL; + struct br_vlan_msg *bvm; + struct nlmsghdr *nlh; + struct sk_buff *skb; + int err = -ENOBUFS; + struct net *net; + u16 flags = 0; + int ifindex; + + /* right now notifications are done only with rtnl held */ + ASSERT_RTNL(); + + if (p) { + ifindex = p->dev->ifindex; + vg = nbp_vlan_group(p); + net = dev_net(p->dev); + } else { + ifindex = br->dev->ifindex; + vg = br_vlan_group(br); + net = dev_net(br->dev); + } + + skb = nlmsg_new(rtnl_vlan_nlmsg_size(), GFP_KERNEL); + if (!skb) + goto out_err; + + err = -EMSGSIZE; + nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*bvm), 0); + if (!nlh) + goto out_err; + bvm = nlmsg_data(nlh); + memset(bvm, 0, sizeof(*bvm)); + bvm->family = AF_BRIDGE; + bvm->ifindex = ifindex; + + switch (cmd) { + case RTM_NEWVLAN: + /* need to find the vlan due to flags/options */ + v = br_vlan_find(vg, vid); + if (!v || !br_vlan_should_use(v)) + goto out_kfree; + + flags = v->flags; + if (br_get_pvid(vg) == v->vid) + flags |= BRIDGE_VLAN_INFO_PVID; + break; + case RTM_DELVLAN: + break; + default: + goto out_kfree; + } + + if (!br_vlan_fill_vids(skb, vid, vid_range, v, flags)) + goto out_err; + + nlmsg_end(skb, nlh); + rtnl_notify(skb, net, 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL); + return; + +out_err: + rtnl_set_sk_err(net, RTNLGRP_BRVLAN, err); +out_kfree: + kfree_skb(skb); +} + +/* check if v_curr can enter a range ending in range_end */ +bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *range_end) +{ + return v_curr->vid - range_end->vid == 1 && + range_end->flags == v_curr->flags && + br_vlan_opts_eq(v_curr, range_end); +} + +static int br_vlan_dump_dev(const struct net_device *dev, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL; + struct net_bridge_vlan_group *vg; + int idx = 0, s_idx = cb->args[1]; + struct nlmsghdr *nlh = NULL; + struct net_bridge_port *p; + struct br_vlan_msg *bvm; + struct net_bridge *br; + int err = 0; + u16 pvid; + + if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) + return -EINVAL; + + if (netif_is_bridge_master(dev)) { + br = netdev_priv(dev); + vg = br_vlan_group_rcu(br); + p = NULL; + } else { + p = br_port_get_rcu(dev); + if (WARN_ON(!p)) + return -EINVAL; + vg = nbp_vlan_group_rcu(p); + br = p->br; + } + + if (!vg) + return 0; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RTM_NEWVLAN, sizeof(*bvm), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + bvm = nlmsg_data(nlh); + memset(bvm, 0, sizeof(*bvm)); + bvm->family = PF_BRIDGE; + bvm->ifindex = dev->ifindex; + pvid = br_get_pvid(vg); + + /* idx must stay at range's beginning until it is filled in */ + list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { + if (!br_vlan_should_use(v)) + continue; + if (idx < s_idx) { + idx++; + continue; + } + + if (!range_start) { + range_start = v; + range_end = v; + continue; + } + + if (v->vid == pvid || !br_vlan_can_enter_range(v, range_end)) { + u16 flags = br_vlan_flags(range_start, pvid); + + if (!br_vlan_fill_vids(skb, range_start->vid, + range_end->vid, range_start, + flags)) { + err = -EMSGSIZE; + break; + } + /* advance number of filled vlans */ + idx += range_end->vid - range_start->vid + 1; + + range_start = v; + } + range_end = v; + } + + /* err will be 0 and range_start will be set in 3 cases here: + * - first vlan (range_start == range_end) + * - last vlan (range_start == range_end, not in range) + * - last vlan range (range_start != range_end, in range) + */ + if (!err && range_start && + !br_vlan_fill_vids(skb, range_start->vid, range_end->vid, + range_start, br_vlan_flags(range_start, pvid))) + err = -EMSGSIZE; + + cb->args[1] = err ? idx : 0; + + nlmsg_end(skb, nlh); + + return err; +} + +static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx = 0, err = 0, s_idx = cb->args[0]; + struct net *net = sock_net(skb->sk); + struct br_vlan_msg *bvm; + struct net_device *dev; + + err = nlmsg_parse(cb->nlh, sizeof(*bvm), NULL, 0, NULL, cb->extack); + if (err < 0) + return err; + + bvm = nlmsg_data(cb->nlh); + + rcu_read_lock(); + if (bvm->ifindex) { + dev = dev_get_by_index_rcu(net, bvm->ifindex); + if (!dev) { + err = -ENODEV; + goto out_err; + } + err = br_vlan_dump_dev(dev, skb, cb); + if (err && err != -EMSGSIZE) + goto out_err; + } else { + for_each_netdev_rcu(net, dev) { + if (idx < s_idx) + goto skip; + + err = br_vlan_dump_dev(dev, skb, cb); + if (err == -EMSGSIZE) + break; +skip: + idx++; + } + } + cb->args[0] = idx; + rcu_read_unlock(); + + return skb->len; + +out_err: + rcu_read_unlock(); + + return err; +} + +static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = { + [BRIDGE_VLANDB_ENTRY_INFO] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct bridge_vlan_info) }, + [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, + [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, +}; + +static int br_vlan_rtm_process_one(struct net_device *dev, + const struct nlattr *attr, + int cmd, struct netlink_ext_ack *extack) +{ + struct bridge_vlan_info *vinfo, vrange_end, *vinfo_last = NULL; + struct nlattr *tb[BRIDGE_VLANDB_ENTRY_MAX + 1]; + bool changed = false, skip_processing = false; + struct net_bridge_vlan_group *vg; + struct net_bridge_port *p = NULL; + int err = 0, cmdmap = 0; + struct net_bridge *br; + + if (netif_is_bridge_master(dev)) { + br = netdev_priv(dev); + vg = br_vlan_group(br); + } else { + p = br_port_get_rtnl(dev); + if (WARN_ON(!p)) + return -ENODEV; + br = p->br; + vg = nbp_vlan_group(p); + } + + if (WARN_ON(!vg)) + return -ENODEV; + + err = nla_parse_nested(tb, BRIDGE_VLANDB_ENTRY_MAX, attr, + br_vlan_db_policy, extack); + if (err) + return err; + + if (!tb[BRIDGE_VLANDB_ENTRY_INFO]) { + NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry info"); + return -EINVAL; + } + memset(&vrange_end, 0, sizeof(vrange_end)); + + vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]); + if (vinfo->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | + BRIDGE_VLAN_INFO_RANGE_END)) { + NL_SET_ERR_MSG_MOD(extack, "Old-style vlan ranges are not allowed when using RTM vlan calls"); + return -EINVAL; + } + if (!br_vlan_valid_id(vinfo->vid, extack)) + return -EINVAL; + + if (tb[BRIDGE_VLANDB_ENTRY_RANGE]) { + vrange_end.vid = nla_get_u16(tb[BRIDGE_VLANDB_ENTRY_RANGE]); + /* validate user-provided flags without RANGE_BEGIN */ + vrange_end.flags = BRIDGE_VLAN_INFO_RANGE_END | vinfo->flags; + vinfo->flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN; + + /* vinfo_last is the range start, vinfo the range end */ + vinfo_last = vinfo; + vinfo = &vrange_end; + + if (!br_vlan_valid_id(vinfo->vid, extack) || + !br_vlan_valid_range(vinfo, vinfo_last, extack)) + return -EINVAL; + } + + switch (cmd) { + case RTM_NEWVLAN: + cmdmap = RTM_SETLINK; + skip_processing = !!(vinfo->flags & BRIDGE_VLAN_INFO_ONLY_OPTS); + break; + case RTM_DELVLAN: + cmdmap = RTM_DELLINK; + break; + } + + if (!skip_processing) { + struct bridge_vlan_info *tmp_last = vinfo_last; + + /* br_process_vlan_info may overwrite vinfo_last */ + err = br_process_vlan_info(br, p, cmdmap, vinfo, &tmp_last, + &changed, extack); + + /* notify first if anything changed */ + if (changed) + br_ifinfo_notify(cmdmap, br, p); + + if (err) + return err; + } + + /* deal with options */ + if (cmd == RTM_NEWVLAN) { + struct net_bridge_vlan *range_start, *range_end; + + if (vinfo_last) { + range_start = br_vlan_find(vg, vinfo_last->vid); + range_end = br_vlan_find(vg, vinfo->vid); + } else { + range_start = br_vlan_find(vg, vinfo->vid); + range_end = range_start; + } + + err = br_vlan_process_options(br, p, range_start, range_end, + tb, extack); + } + + return err; +} + +static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct br_vlan_msg *bvm; + struct net_device *dev; + struct nlattr *attr; + int err, vlans = 0; + int rem; + + /* this should validate the header and check for remaining bytes */ + err = nlmsg_parse(nlh, sizeof(*bvm), NULL, BRIDGE_VLANDB_MAX, NULL, + extack); + if (err < 0) + return err; + + bvm = nlmsg_data(nlh); + dev = __dev_get_by_index(net, bvm->ifindex); + if (!dev) + return -ENODEV; + + if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) { + NL_SET_ERR_MSG_MOD(extack, "The device is not a valid bridge or bridge port"); + return -EINVAL; + } + + nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) { + if (nla_type(attr) != BRIDGE_VLANDB_ENTRY) + continue; + + vlans++; + err = br_vlan_rtm_process_one(dev, attr, nlh->nlmsg_type, + extack); + if (err) + break; + } + if (!vlans) { + NL_SET_ERR_MSG_MOD(extack, "No vlans found to process"); + err = -EINVAL; + } + + return err; +} + +void br_vlan_rtnl_init(void) +{ + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETVLAN, NULL, + br_vlan_rtm_dump, 0); + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWVLAN, + br_vlan_rtm_process, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELVLAN, + br_vlan_rtm_process, NULL, 0); +} + +void br_vlan_rtnl_uninit(void) +{ + rtnl_unregister(PF_BRIDGE, RTM_GETVLAN); + rtnl_unregister(PF_BRIDGE, RTM_NEWVLAN); + rtnl_unregister(PF_BRIDGE, RTM_DELVLAN); +} diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c new file mode 100644 index 000000000000..cd2eb194eb98 --- /dev/null +++ b/net/bridge/br_vlan_options.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2020, Nikolay Aleksandrov <nikolay@cumulusnetworks.com> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/slab.h> + +#include "br_private.h" + +/* check if the options between two vlans are equal */ +bool br_vlan_opts_eq(const struct net_bridge_vlan *v1, + const struct net_bridge_vlan *v2) +{ + return v1->state == v2->state; +} + +bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) +{ + return !nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, + br_vlan_get_state(v)); +} + +size_t br_vlan_opts_nl_size(void) +{ + return nla_total_size(sizeof(u8)); /* BRIDGE_VLANDB_ENTRY_STATE */ +} + +static int br_vlan_modify_state(struct net_bridge_vlan_group *vg, + struct net_bridge_vlan *v, + u8 state, + bool *changed, + struct netlink_ext_ack *extack) +{ + struct net_bridge *br; + + ASSERT_RTNL(); + + if (state > BR_STATE_BLOCKING) { + NL_SET_ERR_MSG_MOD(extack, "Invalid vlan state"); + return -EINVAL; + } + + if (br_vlan_is_brentry(v)) + br = v->br; + else + br = v->port->br; + + if (br->stp_enabled == BR_KERNEL_STP) { + NL_SET_ERR_MSG_MOD(extack, "Can't modify vlan state when using kernel STP"); + return -EBUSY; + } + + if (v->state == state) + return 0; + + if (v->vid == br_get_pvid(vg)) + br_vlan_set_pvid_state(vg, state); + + br_vlan_set_state(v, state); + *changed = true; + + return 0; +} + +static int br_vlan_process_one_opts(const struct net_bridge *br, + const struct net_bridge_port *p, + struct net_bridge_vlan_group *vg, + struct net_bridge_vlan *v, + struct nlattr **tb, + bool *changed, + struct netlink_ext_ack *extack) +{ + int err; + + *changed = false; + if (tb[BRIDGE_VLANDB_ENTRY_STATE]) { + u8 state = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_STATE]); + + err = br_vlan_modify_state(vg, v, state, changed, extack); + if (err) + return err; + } + + return 0; +} + +int br_vlan_process_options(const struct net_bridge *br, + const struct net_bridge_port *p, + struct net_bridge_vlan *range_start, + struct net_bridge_vlan *range_end, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan *v, *curr_start = NULL, *curr_end = NULL; + struct net_bridge_vlan_group *vg; + int vid, err = 0; + u16 pvid; + + if (p) + vg = nbp_vlan_group(p); + else + vg = br_vlan_group(br); + + if (!range_start || !br_vlan_should_use(range_start)) { + NL_SET_ERR_MSG_MOD(extack, "Vlan range start doesn't exist, can't process options"); + return -ENOENT; + } + if (!range_end || !br_vlan_should_use(range_end)) { + NL_SET_ERR_MSG_MOD(extack, "Vlan range end doesn't exist, can't process options"); + return -ENOENT; + } + + pvid = br_get_pvid(vg); + for (vid = range_start->vid; vid <= range_end->vid; vid++) { + bool changed = false; + + v = br_vlan_find(vg, vid); + if (!v || !br_vlan_should_use(v)) { + NL_SET_ERR_MSG_MOD(extack, "Vlan in range doesn't exist, can't process options"); + err = -ENOENT; + break; + } + + err = br_vlan_process_one_opts(br, p, vg, v, tb, &changed, + extack); + if (err) + break; + + if (changed) { + /* vlan options changed, check for range */ + if (!curr_start) { + curr_start = v; + curr_end = v; + continue; + } + + if (v->vid == pvid || + !br_vlan_can_enter_range(v, curr_end)) { + br_vlan_notify(br, p, curr_start->vid, + curr_end->vid, RTM_NEWVLAN); + curr_start = v; + } + curr_end = v; + } else { + /* nothing changed and nothing to notify yet */ + if (!curr_start) + continue; + + br_vlan_notify(br, p, curr_start->vid, curr_end->vid, + RTM_NEWVLAN); + curr_start = NULL; + curr_end = NULL; + } + } + if (curr_start) + br_vlan_notify(br, p, curr_start->vid, curr_end->vid, + RTM_NEWVLAN); + + return err; +} diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c index 2c8fe24400e5..68c2519bdc52 100644 --- a/net/bridge/netfilter/ebt_802_3.c +++ b/net/bridge/netfilter/ebt_802_3.c @@ -11,7 +11,13 @@ #include <linux/module.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> -#include <linux/netfilter_bridge/ebt_802_3.h> +#include <linux/skbuff.h> +#include <uapi/linux/netfilter_bridge/ebt_802_3.h> + +static struct ebt_802_3_hdr *ebt_802_3_hdr(const struct sk_buff *skb) +{ + return (struct ebt_802_3_hdr *)skb_mac_header(skb); +} static bool ebt_802_3_mt(const struct sk_buff *skb, struct xt_action_param *par) diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c index ed91ea31978a..12a4f4d93681 100644 --- a/net/bridge/netfilter/ebt_dnat.c +++ b/net/bridge/netfilter/ebt_dnat.c @@ -20,7 +20,6 @@ static unsigned int ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nat_info *info = par->targinfo; - struct net_device *dev; if (skb_ensure_writable(skb, ETH_ALEN)) return EBT_DROP; @@ -33,10 +32,22 @@ ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par) else skb->pkt_type = PACKET_MULTICAST; } else { - if (xt_hooknum(par) != NF_BR_BROUTING) - dev = br_port_get_rcu(xt_in(par))->br->dev; - else + const struct net_device *dev; + + switch (xt_hooknum(par)) { + case NF_BR_BROUTING: dev = xt_in(par); + break; + case NF_BR_PRE_ROUTING: + dev = br_port_get_rcu(xt_in(par))->br->dev; + break; + default: + dev = NULL; + break; + } + + if (!dev) /* NF_BR_LOCAL_OUT */ + return info->target; if (ether_addr_equal(info->mac, dev->dev_addr)) skb->pkt_type = PACKET_HOST; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index c8177a89f52c..e1256e03a9a8 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -221,7 +221,7 @@ unsigned int ebt_do_table(struct sk_buff *skb, return NF_DROP; } - ADD_COUNTER(*(counter_base + i), 1, skb->len); + ADD_COUNTER(*(counter_base + i), skb->len, 1); /* these should only watch: not modify, nor tell us * what to do with the packet @@ -959,8 +959,8 @@ static void get_counters(const struct ebt_counter *oldcounters, continue; counter_base = COUNTER_BASE(oldcounters, nentries, cpu); for (i = 0; i < nentries; i++) - ADD_COUNTER(counters[i], counter_base[i].pcnt, - counter_base[i].bcnt); + ADD_COUNTER(counters[i], counter_base[i].bcnt, + counter_base[i].pcnt); } } @@ -1280,7 +1280,7 @@ static int do_update_counters(struct net *net, const char *name, /* we add to the counters of the first cpu */ for (i = 0; i < num_counters; i++) - ADD_COUNTER(t->private->counters[i], tmp[i].pcnt, tmp[i].bcnt); + ADD_COUNTER(t->private->counters[i], tmp[i].bcnt, tmp[i].pcnt); write_unlock_bh(&t->lock); ret = 0; @@ -1867,7 +1867,7 @@ static int ebt_buf_count(struct ebt_entries_buf_state *state, unsigned int sz) } static int ebt_buf_add(struct ebt_entries_buf_state *state, - void *data, unsigned int sz) + const void *data, unsigned int sz) { if (state->buf_kern_start == NULL) goto count_only; @@ -1901,7 +1901,7 @@ enum compat_mwt { EBT_COMPAT_TARGET, }; -static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, +static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt, enum compat_mwt compat_mwt, struct ebt_entries_buf_state *state, const unsigned char *base) @@ -1979,22 +1979,23 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, /* return size of all matches, watchers or target, including necessary * alignment and padding. */ -static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, +static int ebt_size_mwt(const struct compat_ebt_entry_mwt *match32, unsigned int size_left, enum compat_mwt type, struct ebt_entries_buf_state *state, const void *base) { + const char *buf = (const char *)match32; int growth = 0; - char *buf; if (size_left == 0) return 0; - buf = (char *) match32; - - while (size_left >= sizeof(*match32)) { + do { struct ebt_entry_match *match_kern; int ret; + if (size_left < sizeof(*match32)) + return -EINVAL; + match_kern = (struct ebt_entry_match *) state->buf_kern_start; if (match_kern) { char *tmp; @@ -2031,22 +2032,18 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, if (match_kern) match_kern->match_size = ret; - /* rule should have no remaining data after target */ - if (type == EBT_COMPAT_TARGET && size_left) - return -EINVAL; - match32 = (struct compat_ebt_entry_mwt *) buf; - } + } while (size_left); return growth; } /* called for all ebt_entry structures. */ -static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, +static int size_entry_mwt(const struct ebt_entry *entry, const unsigned char *base, unsigned int *total, struct ebt_entries_buf_state *state) { - unsigned int i, j, startoff, new_offset = 0; + unsigned int i, j, startoff, next_expected_off, new_offset = 0; /* stores match/watchers/targets & offset of next struct ebt_entry: */ unsigned int offsets[4]; unsigned int *offsets_update = NULL; @@ -2132,11 +2129,13 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, return ret; } - startoff = state->buf_user_offset - startoff; + next_expected_off = state->buf_user_offset - startoff; + if (next_expected_off != entry->next_offset) + return -EINVAL; - if (WARN_ON(*total < startoff)) + if (*total < entry->next_offset) return -EINVAL; - *total -= startoff; + *total -= entry->next_offset; return 0; } diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 4f5444d2a526..809673222382 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -17,7 +17,6 @@ #include <net/netfilter/nf_conntrack_bridge.h> #include <linux/netfilter/nf_tables.h> -#include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include <net/netfilter/nf_tables.h> #include "../br_private.h" @@ -27,13 +26,14 @@ */ static int nf_br_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - struct nf_ct_bridge_frag_data *data, + struct nf_bridge_frag_data *data, int (*output)(struct net *, struct sock *sk, - const struct nf_ct_bridge_frag_data *data, + const struct nf_bridge_frag_data *data, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; unsigned int hlen, ll_rs, mtu; + ktime_t tstamp = skb->tstamp; struct ip_frag_state state; struct iphdr *iph; int err; @@ -81,6 +81,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, if (iter.frag) ip_fraglist_prepare(skb, &iter); + skb->tstamp = tstamp; err = output(net, sk, data, skb); if (err || !iter.frag) break; @@ -94,7 +95,7 @@ slow_path: * This may also be a clone skbuff, we could preserve the geometry for * the copies but probably not worth the effort. */ - ip_frag_init(skb, hlen, ll_rs, frag_max_size, &state); + ip_frag_init(skb, hlen, ll_rs, frag_max_size, false, &state); while (state.left > 0) { struct sk_buff *skb2; @@ -105,6 +106,7 @@ slow_path: goto blackhole; } + skb2->tstamp = tstamp; err = output(net, sk, data, skb2); if (err) goto blackhole; @@ -279,7 +281,7 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, } static void nf_ct_bridge_frag_save(struct sk_buff *skb, - struct nf_ct_bridge_frag_data *data) + struct nf_bridge_frag_data *data) { if (skb_vlan_tag_present(skb)) { data->vlan_present = true; @@ -294,10 +296,10 @@ static void nf_ct_bridge_frag_save(struct sk_buff *skb, static unsigned int nf_ct_bridge_refrag(struct sk_buff *skb, const struct nf_hook_state *state, int (*output)(struct net *, struct sock *sk, - const struct nf_ct_bridge_frag_data *data, + const struct nf_bridge_frag_data *data, struct sk_buff *)) { - struct nf_ct_bridge_frag_data data; + struct nf_bridge_frag_data data; if (!BR_INPUT_SKB_CB(skb)->frag_max_size) return NF_ACCEPT; @@ -320,7 +322,7 @@ nf_ct_bridge_refrag(struct sk_buff *skb, const struct nf_hook_state *state, /* Actually only slow path refragmentation needs this. */ static int nf_ct_bridge_frag_restore(struct sk_buff *skb, - const struct nf_ct_bridge_frag_data *data) + const struct nf_bridge_frag_data *data) { int err; @@ -341,7 +343,7 @@ static int nf_ct_bridge_frag_restore(struct sk_buff *skb, } static int nf_ct_bridge_refrag_post(struct net *net, struct sock *sk, - const struct nf_ct_bridge_frag_data *data, + const struct nf_bridge_frag_data *data, struct sk_buff *skb) { int err; diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 1804e867f715..7c9e92b2f806 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -53,7 +53,7 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr, goto err; br_vlan_get_proto(br_dev, &p_proto); - nft_reg_store16(dest, p_proto); + nft_reg_store16(dest, htons(p_proto)); return; } default: diff --git a/net/caif/Kconfig b/net/caif/Kconfig index eb83051c8330..b7532a79ca7a 100644 --- a/net/caif/Kconfig +++ b/net/caif/Kconfig @@ -13,11 +13,11 @@ menuconfig CAIF with its modems. It is accessed from user space as sockets (PF_CAIF). Say Y (or M) here if you build for a phone product (e.g. Android or - MeeGo ) that uses CAIF as transport, if unsure say N. + MeeGo) that uses CAIF as transport. If unsure say N. If you select to build it as module then CAIF_NETDEV also needs to be - built as modules. You will also need to say yes to any CAIF physical - devices that your platform requires. + built as a module. You will also need to say Y (or M) to any CAIF + physical devices that your platform requires. See Documentation/networking/caif for a further explanation on how to use and configure CAIF. @@ -37,7 +37,7 @@ config CAIF_NETDEV default CAIF ---help--- Say Y if you will be using a CAIF based GPRS network device. - This can be either built-in or a loadable module, + This can be either built-in or a loadable module. If you select to build it as a built-in then the main CAIF device must also be a built-in. If unsure say Y. @@ -48,7 +48,7 @@ config CAIF_USB default n ---help--- Say Y if you are using CAIF over USB CDC NCM. - This can be either built-in or a loadable module, + This can be either built-in or a loadable module. If you select to build it as a built-in then the main CAIF device must also be a built-in. If unsure say N. diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 13ea920600ae..ef14da50a981 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -953,7 +953,7 @@ static __poll_t caif_poll(struct file *file, mask |= EPOLLRDHUP; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue) || + if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) mask |= EPOLLIN | EPOLLRDNORM; diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index 76bd67891fb3..a0116b9503d9 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -62,7 +62,7 @@ static int cfusbl_transmit(struct cflayer *layr, struct cfpkt *pkt) hpad = (info->hdr_len + CFUSB_PAD_DESCR_SZ) & (CFUSB_ALIGNMENT - 1); if (skb_headroom(skb) < ETH_HLEN + CFUSB_PAD_DESCR_SZ + hpad) { - pr_warn("Headroom to small\n"); + pr_warn("Headroom too small\n"); kfree_skb(skb); return -EIO; } diff --git a/net/can/Kconfig b/net/can/Kconfig index 0f9fe846ddef..d77042752457 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -8,11 +8,12 @@ menuconfig CAN tristate "CAN bus subsystem support" ---help--- Controller Area Network (CAN) is a slow (up to 1Mbit/s) serial - communications protocol which was developed by Bosch in - 1991, mainly for automotive, but now widely used in marine - (NMEA2000), industrial, and medical applications. - More information on the CAN network protocol family PF_CAN - is contained in <Documentation/networking/can.rst>. + communications protocol. Development of the CAN bus started in + 1983 at Robert Bosch GmbH, and the protocol was officially + released in 1986. The CAN bus was originally mainly for automotive, + but is now widely used in marine (NMEA2000), industrial, and medical + applications. More information on the CAN network protocol family + PF_CAN is contained in <Documentation/networking/can.rst>. If you want CAN support you should say Y here and also to the specific driver for your controller(s) below. @@ -52,6 +53,8 @@ config CAN_GW They can be modified with AND/OR/XOR/SET operations as configured by the netlink configuration interface known e.g. from iptables. +source "net/can/j1939/Kconfig" + source "drivers/net/can/Kconfig" endif diff --git a/net/can/Makefile b/net/can/Makefile index 1242bbbfe57f..08bd217fc051 100644 --- a/net/can/Makefile +++ b/net/can/Makefile @@ -15,3 +15,5 @@ can-bcm-y := bcm.o obj-$(CONFIG_CAN_GW) += can-gw.o can-gw-y := gw.o + +obj-$(CONFIG_CAN_J1939) += j1939/ diff --git a/net/can/af_can.c b/net/can/af_can.c index 80281ef2ccbd..128d37a4c2e0 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -1,5 +1,5 @@ -/* - * af_can.c - Protocol family CAN core module +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* af_can.c - Protocol family CAN core module * (used by different CAN protocol modules) * * Copyright (c) 2002-2017 Volkswagen Group Electronic Research @@ -58,6 +58,7 @@ #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> +#include <linux/can/can-ml.h> #include <linux/ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -83,24 +84,14 @@ static DEFINE_MUTEX(proto_tab_lock); static atomic_t skbcounter = ATOMIC_INIT(0); -/* - * af_can socket functions - */ - -int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) -{ - switch (cmd) { - default: - return -ENOIOCTLCMD; - } -} -EXPORT_SYMBOL(can_ioctl); +/* af_can socket functions */ -static void can_sock_destruct(struct sock *sk) +void can_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_error_queue); } +EXPORT_SYMBOL(can_sock_destruct); static const struct can_proto *can_get_proto(int protocol) { @@ -140,14 +131,13 @@ static int can_create(struct net *net, struct socket *sock, int protocol, err = request_module("can-proto-%d", protocol); - /* - * In case of error we only print a message but don't + /* In case of error we only print a message but don't * return the error code immediately. Below we will * return -EPROTONOSUPPORT */ if (err) - printk_ratelimited(KERN_ERR "can: request_module " - "(can-proto-%d) failed.\n", protocol); + pr_err_ratelimited("can: request_module (can-proto-%d) failed.\n", + protocol); cp = can_get_proto(protocol); } @@ -188,9 +178,7 @@ static int can_create(struct net *net, struct socket *sock, int protocol, return err; } -/* - * af_can tx path - */ +/* af_can tx path */ /** * can_send - transmit a CAN frame (optional with local loopback) @@ -212,7 +200,7 @@ int can_send(struct sk_buff *skb, int loop) { struct sk_buff *newskb = NULL; struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - struct s_stats *can_stats = dev_net(skb->dev)->can.can_stats; + struct can_pkg_stats *pkg_stats = dev_net(skb->dev)->can.pkg_stats; int err = -EINVAL; if (skb->len == CAN_MTU) { @@ -223,11 +211,11 @@ int can_send(struct sk_buff *skb, int loop) skb->protocol = htons(ETH_P_CANFD); if (unlikely(cfd->len > CANFD_MAX_DLEN)) goto inval_skb; - } else + } else { goto inval_skb; + } - /* - * Make sure the CAN frame can pass the selected CAN netdevice. + /* Make sure the CAN frame can pass the selected CAN netdevice. * As structs can_frame and canfd_frame are similar, we can provide * CAN FD frames to legacy CAN drivers as long as the length is <= 8 */ @@ -258,8 +246,7 @@ int can_send(struct sk_buff *skb, int loop) /* indication for the CAN driver: do loopback */ skb->pkt_type = PACKET_LOOPBACK; - /* - * The reference to the originating sock may be required + /* The reference to the originating sock may be required * by the receiving socket to check whether the frame is * its own. Example: can_raw sockopt CAN_RAW_RECV_OWN_MSGS * Therefore we have to ensure that skb->sk remains the @@ -268,8 +255,7 @@ int can_send(struct sk_buff *skb, int loop) */ if (!(skb->dev->flags & IFF_ECHO)) { - /* - * If the interface is not capable to do loopback + /* If the interface is not capable to do loopback * itself, we do it here. */ newskb = skb_clone(skb, GFP_ATOMIC); @@ -301,8 +287,8 @@ int can_send(struct sk_buff *skb, int loop) netif_rx_ni(newskb); /* update statistics */ - can_stats->tx_frames++; - can_stats->tx_frames_delta++; + pkg_stats->tx_frames++; + pkg_stats->tx_frames_delta++; return 0; @@ -312,17 +298,17 @@ inval_skb: } EXPORT_SYMBOL(can_send); -/* - * af_can rx path - */ +/* af_can rx path */ -static struct can_dev_rcv_lists *find_dev_rcv_lists(struct net *net, - struct net_device *dev) +static struct can_dev_rcv_lists *can_dev_rcv_lists_find(struct net *net, + struct net_device *dev) { - if (!dev) - return net->can.can_rx_alldev_list; - else - return (struct can_dev_rcv_lists *)dev->ml_priv; + if (dev) { + struct can_ml_priv *ml_priv = dev->ml_priv; + return &ml_priv->dev_rcv_lists; + } else { + return net->can.rx_alldev_list; + } } /** @@ -349,7 +335,7 @@ static unsigned int effhash(canid_t can_id) } /** - * find_rcv_list - determine optimal filterlist inside device filter struct + * can_rcv_list_find - determine optimal filterlist inside device filter struct * @can_id: pointer to CAN identifier of a given can_filter * @mask: pointer to CAN mask of a given can_filter * @d: pointer to the device filter struct @@ -375,8 +361,8 @@ static unsigned int effhash(canid_t can_id) * Constistency checked mask. * Reduced can_id to have a preprocessed filter compare value. */ -static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask, - struct can_dev_rcv_lists *d) +static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask, + struct can_dev_rcv_lists *dev_rcv_lists) { canid_t inv = *can_id & CAN_INV_FILTER; /* save flag before masking */ @@ -384,7 +370,7 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask, if (*mask & CAN_ERR_FLAG) { /* clear CAN_ERR_FLAG in filter entry */ *mask &= CAN_ERR_MASK; - return &d->rx[RX_ERR]; + return &dev_rcv_lists->rx[RX_ERR]; } /* with cleared CAN_ERR_FLAG we have a simple mask/value filterpair */ @@ -400,27 +386,26 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask, /* inverse can_id/can_mask filter */ if (inv) - return &d->rx[RX_INV]; + return &dev_rcv_lists->rx[RX_INV]; /* mask == 0 => no condition testing at receive time */ if (!(*mask)) - return &d->rx[RX_ALL]; + return &dev_rcv_lists->rx[RX_ALL]; /* extra filterlists for the subscription of a single non-RTR can_id */ if (((*mask & CAN_EFF_RTR_FLAGS) == CAN_EFF_RTR_FLAGS) && !(*can_id & CAN_RTR_FLAG)) { - if (*can_id & CAN_EFF_FLAG) { if (*mask == (CAN_EFF_MASK | CAN_EFF_RTR_FLAGS)) - return &d->rx_eff[effhash(*can_id)]; + return &dev_rcv_lists->rx_eff[effhash(*can_id)]; } else { if (*mask == (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS)) - return &d->rx_sff[*can_id]; + return &dev_rcv_lists->rx_sff[*can_id]; } } /* default: filter via can_id/can_mask */ - return &d->rx[RX_FIL]; + return &dev_rcv_lists->rx[RX_FIL]; } /** @@ -457,10 +442,10 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data, char *ident, struct sock *sk) { - struct receiver *r; - struct hlist_head *rl; - struct can_dev_rcv_lists *d; - struct s_pstats *can_pstats = net->can.can_pstats; + struct receiver *rcv; + struct hlist_head *rcv_list; + struct can_dev_rcv_lists *dev_rcv_lists; + struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; int err = 0; /* insert new receiver (dev,canid,mask) -> (func,data) */ @@ -471,50 +456,42 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, if (dev && !net_eq(net, dev_net(dev))) return -ENODEV; - r = kmem_cache_alloc(rcv_cache, GFP_KERNEL); - if (!r) + rcv = kmem_cache_alloc(rcv_cache, GFP_KERNEL); + if (!rcv) return -ENOMEM; - spin_lock(&net->can.can_rcvlists_lock); + spin_lock_bh(&net->can.rcvlists_lock); - d = find_dev_rcv_lists(net, dev); - if (d) { - rl = find_rcv_list(&can_id, &mask, d); + dev_rcv_lists = can_dev_rcv_lists_find(net, dev); + rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists); - r->can_id = can_id; - r->mask = mask; - r->matches = 0; - r->func = func; - r->data = data; - r->ident = ident; - r->sk = sk; + rcv->can_id = can_id; + rcv->mask = mask; + rcv->matches = 0; + rcv->func = func; + rcv->data = data; + rcv->ident = ident; + rcv->sk = sk; - hlist_add_head_rcu(&r->list, rl); - d->entries++; + hlist_add_head_rcu(&rcv->list, rcv_list); + dev_rcv_lists->entries++; - can_pstats->rcv_entries++; - if (can_pstats->rcv_entries_max < can_pstats->rcv_entries) - can_pstats->rcv_entries_max = can_pstats->rcv_entries; - } else { - kmem_cache_free(rcv_cache, r); - err = -ENODEV; - } - - spin_unlock(&net->can.can_rcvlists_lock); + rcv_lists_stats->rcv_entries++; + rcv_lists_stats->rcv_entries_max = max(rcv_lists_stats->rcv_entries_max, + rcv_lists_stats->rcv_entries); + spin_unlock_bh(&net->can.rcvlists_lock); return err; } EXPORT_SYMBOL(can_rx_register); -/* - * can_rx_delete_receiver - rcu callback for single receiver entry removal - */ +/* can_rx_delete_receiver - rcu callback for single receiver entry removal */ static void can_rx_delete_receiver(struct rcu_head *rp) { - struct receiver *r = container_of(rp, struct receiver, rcu); - struct sock *sk = r->sk; + struct receiver *rcv = container_of(rp, struct receiver, rcu); + struct sock *sk = rcv->sk; - kmem_cache_free(rcv_cache, r); + kmem_cache_free(rcv_cache, rcv); if (sk) sock_put(sk); } @@ -534,10 +511,10 @@ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data) { - struct receiver *r = NULL; - struct hlist_head *rl; - struct s_pstats *can_pstats = net->can.can_pstats; - struct can_dev_rcv_lists *d; + struct receiver *rcv = NULL; + struct hlist_head *rcv_list; + struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; + struct can_dev_rcv_lists *dev_rcv_lists; if (dev && dev->type != ARPHRD_CAN) return; @@ -545,86 +522,69 @@ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id, if (dev && !net_eq(net, dev_net(dev))) return; - spin_lock(&net->can.can_rcvlists_lock); + spin_lock_bh(&net->can.rcvlists_lock); - d = find_dev_rcv_lists(net, dev); - if (!d) { - pr_err("BUG: receive list not found for " - "dev %s, id %03X, mask %03X\n", - DNAME(dev), can_id, mask); - goto out; - } + dev_rcv_lists = can_dev_rcv_lists_find(net, dev); + rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists); - rl = find_rcv_list(&can_id, &mask, d); - - /* - * Search the receiver list for the item to delete. This should + /* Search the receiver list for the item to delete. This should * exist, since no receiver may be unregistered that hasn't * been registered before. */ - - hlist_for_each_entry_rcu(r, rl, list) { - if (r->can_id == can_id && r->mask == mask && - r->func == func && r->data == data) + hlist_for_each_entry_rcu(rcv, rcv_list, list) { + if (rcv->can_id == can_id && rcv->mask == mask && + rcv->func == func && rcv->data == data) break; } - /* - * Check for bugs in CAN protocol implementations using af_can.c: - * 'r' will be NULL if no matching list item was found for removal. + /* Check for bugs in CAN protocol implementations using af_can.c: + * 'rcv' will be NULL if no matching list item was found for removal. */ - - if (!r) { - WARN(1, "BUG: receive list entry not found for dev %s, " - "id %03X, mask %03X\n", DNAME(dev), can_id, mask); + if (!rcv) { + WARN(1, "BUG: receive list entry not found for dev %s, id %03X, mask %03X\n", + DNAME(dev), can_id, mask); goto out; } - hlist_del_rcu(&r->list); - d->entries--; + hlist_del_rcu(&rcv->list); + dev_rcv_lists->entries--; - if (can_pstats->rcv_entries > 0) - can_pstats->rcv_entries--; - - /* remove device structure requested by NETDEV_UNREGISTER */ - if (d->remove_on_zero_entries && !d->entries) { - kfree(d); - dev->ml_priv = NULL; - } + if (rcv_lists_stats->rcv_entries > 0) + rcv_lists_stats->rcv_entries--; out: - spin_unlock(&net->can.can_rcvlists_lock); + spin_unlock_bh(&net->can.rcvlists_lock); /* schedule the receiver item for deletion */ - if (r) { - if (r->sk) - sock_hold(r->sk); - call_rcu(&r->rcu, can_rx_delete_receiver); + if (rcv) { + if (rcv->sk) + sock_hold(rcv->sk); + call_rcu(&rcv->rcu, can_rx_delete_receiver); } } EXPORT_SYMBOL(can_rx_unregister); -static inline void deliver(struct sk_buff *skb, struct receiver *r) +static inline void deliver(struct sk_buff *skb, struct receiver *rcv) { - r->func(skb, r->data); - r->matches++; + rcv->func(skb, rcv->data); + rcv->matches++; } -static int can_rcv_filter(struct can_dev_rcv_lists *d, struct sk_buff *skb) +static int can_rcv_filter(struct can_dev_rcv_lists *dev_rcv_lists, struct sk_buff *skb) { - struct receiver *r; + struct receiver *rcv; int matches = 0; struct can_frame *cf = (struct can_frame *)skb->data; canid_t can_id = cf->can_id; - if (d->entries == 0) + if (dev_rcv_lists->entries == 0) return 0; if (can_id & CAN_ERR_FLAG) { /* check for error message frame entries only */ - hlist_for_each_entry_rcu(r, &d->rx[RX_ERR], list) { - if (can_id & r->mask) { - deliver(skb, r); + hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ERR], list) { + if (can_id & rcv->mask) { + deliver(skb, rcv); matches++; } } @@ -632,23 +592,23 @@ static int can_rcv_filter(struct can_dev_rcv_lists *d, struct sk_buff *skb) } /* check for unfiltered entries */ - hlist_for_each_entry_rcu(r, &d->rx[RX_ALL], list) { - deliver(skb, r); + hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ALL], list) { + deliver(skb, rcv); matches++; } /* check for can_id/mask entries */ - hlist_for_each_entry_rcu(r, &d->rx[RX_FIL], list) { - if ((can_id & r->mask) == r->can_id) { - deliver(skb, r); + hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_FIL], list) { + if ((can_id & rcv->mask) == rcv->can_id) { + deliver(skb, rcv); matches++; } } /* check for inverted can_id/mask entries */ - hlist_for_each_entry_rcu(r, &d->rx[RX_INV], list) { - if ((can_id & r->mask) != r->can_id) { - deliver(skb, r); + hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_INV], list) { + if ((can_id & rcv->mask) != rcv->can_id) { + deliver(skb, rcv); matches++; } } @@ -658,16 +618,16 @@ static int can_rcv_filter(struct can_dev_rcv_lists *d, struct sk_buff *skb) return matches; if (can_id & CAN_EFF_FLAG) { - hlist_for_each_entry_rcu(r, &d->rx_eff[effhash(can_id)], list) { - if (r->can_id == can_id) { - deliver(skb, r); + hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_eff[effhash(can_id)], list) { + if (rcv->can_id == can_id) { + deliver(skb, rcv); matches++; } } } else { can_id &= CAN_SFF_MASK; - hlist_for_each_entry_rcu(r, &d->rx_sff[can_id], list) { - deliver(skb, r); + hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_sff[can_id], list) { + deliver(skb, rcv); matches++; } } @@ -677,14 +637,14 @@ static int can_rcv_filter(struct can_dev_rcv_lists *d, struct sk_buff *skb) static void can_receive(struct sk_buff *skb, struct net_device *dev) { - struct can_dev_rcv_lists *d; + struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = dev_net(dev); - struct s_stats *can_stats = net->can.can_stats; + struct can_pkg_stats *pkg_stats = net->can.pkg_stats; int matches; /* update statistics */ - can_stats->rx_frames++; - can_stats->rx_frames_delta++; + pkg_stats->rx_frames++; + pkg_stats->rx_frames_delta++; /* create non-zero unique skb identifier together with *skb */ while (!(can_skb_prv(skb)->skbcnt)) @@ -693,12 +653,11 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) rcu_read_lock(); /* deliver the packet to sockets listening on all devices */ - matches = can_rcv_filter(net->can.can_rx_alldev_list, skb); + matches = can_rcv_filter(net->can.rx_alldev_list, skb); /* find receive list for this device */ - d = find_dev_rcv_lists(net, dev); - if (d) - matches += can_rcv_filter(d, skb); + dev_rcv_lists = can_dev_rcv_lists_find(net, dev); + matches += can_rcv_filter(dev_rcv_lists, skb); rcu_read_unlock(); @@ -706,8 +665,8 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) consume_skb(skb); if (matches > 0) { - can_stats->matches++; - can_stats->matches_delta++; + pkg_stats->matches++; + pkg_stats->matches_delta++; } } @@ -729,7 +688,7 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev, } static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) + struct packet_type *pt, struct net_device *orig_dev) { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; @@ -745,9 +704,7 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, return NET_RX_SUCCESS; } -/* - * af_can protocol functions - */ +/* af_can protocol functions */ /** * can_proto_register - register CAN transport protocol @@ -778,8 +735,9 @@ int can_proto_register(const struct can_proto *cp) if (rcu_access_pointer(proto_tab[proto])) { pr_err("can: protocol %d already registered\n", proto); err = -EBUSY; - } else + } else { RCU_INIT_POINTER(proto_tab[proto], cp); + } mutex_unlock(&proto_tab_lock); @@ -809,48 +767,19 @@ void can_proto_unregister(const struct can_proto *cp) } EXPORT_SYMBOL(can_proto_unregister); -/* - * af_can notifier to create/remove CAN netdevice specific structs - */ +/* af_can notifier to create/remove CAN netdevice specific structs */ static int can_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct can_dev_rcv_lists *d; if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; switch (msg) { - case NETDEV_REGISTER: - - /* create new dev_rcv_lists for this device */ - d = kzalloc(sizeof(*d), GFP_KERNEL); - if (!d) - return NOTIFY_DONE; - BUG_ON(dev->ml_priv); - dev->ml_priv = d; - - break; - - case NETDEV_UNREGISTER: - spin_lock(&dev_net(dev)->can.can_rcvlists_lock); - - d = dev->ml_priv; - if (d) { - if (d->entries) - d->remove_on_zero_entries = 1; - else { - kfree(d); - dev->ml_priv = NULL; - } - } else - pr_err("can: notifier: receive list not found for dev " - "%s\n", dev->name); - - spin_unlock(&dev_net(dev)->can.can_rcvlists_lock); - + WARN(!dev->ml_priv, + "No CAN mid layer private allocated, please fix your driver and use alloc_candev()!\n"); break; } @@ -859,71 +788,54 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg, static int can_pernet_init(struct net *net) { - spin_lock_init(&net->can.can_rcvlists_lock); - net->can.can_rx_alldev_list = - kzalloc(sizeof(struct can_dev_rcv_lists), GFP_KERNEL); - if (!net->can.can_rx_alldev_list) + spin_lock_init(&net->can.rcvlists_lock); + net->can.rx_alldev_list = + kzalloc(sizeof(*net->can.rx_alldev_list), GFP_KERNEL); + if (!net->can.rx_alldev_list) goto out; - net->can.can_stats = kzalloc(sizeof(struct s_stats), GFP_KERNEL); - if (!net->can.can_stats) - goto out_free_alldev_list; - net->can.can_pstats = kzalloc(sizeof(struct s_pstats), GFP_KERNEL); - if (!net->can.can_pstats) - goto out_free_can_stats; + net->can.pkg_stats = kzalloc(sizeof(*net->can.pkg_stats), GFP_KERNEL); + if (!net->can.pkg_stats) + goto out_free_rx_alldev_list; + net->can.rcv_lists_stats = kzalloc(sizeof(*net->can.rcv_lists_stats), GFP_KERNEL); + if (!net->can.rcv_lists_stats) + goto out_free_pkg_stats; if (IS_ENABLED(CONFIG_PROC_FS)) { /* the statistics are updated every second (timer triggered) */ if (stats_timer) { - timer_setup(&net->can.can_stattimer, can_stat_update, + timer_setup(&net->can.stattimer, can_stat_update, 0); - mod_timer(&net->can.can_stattimer, + mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ)); } - net->can.can_stats->jiffies_init = jiffies; + net->can.pkg_stats->jiffies_init = jiffies; can_init_proc(net); } return 0; - out_free_can_stats: - kfree(net->can.can_stats); - out_free_alldev_list: - kfree(net->can.can_rx_alldev_list); + out_free_pkg_stats: + kfree(net->can.pkg_stats); + out_free_rx_alldev_list: + kfree(net->can.rx_alldev_list); out: return -ENOMEM; } static void can_pernet_exit(struct net *net) { - struct net_device *dev; - if (IS_ENABLED(CONFIG_PROC_FS)) { can_remove_proc(net); if (stats_timer) - del_timer_sync(&net->can.can_stattimer); + del_timer_sync(&net->can.stattimer); } - /* remove created dev_rcv_lists from still registered CAN devices */ - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { - if (dev->type == ARPHRD_CAN && dev->ml_priv) { - struct can_dev_rcv_lists *d = dev->ml_priv; - - BUG_ON(d->entries); - kfree(d); - dev->ml_priv = NULL; - } - } - rcu_read_unlock(); - - kfree(net->can.can_rx_alldev_list); - kfree(net->can.can_stats); - kfree(net->can.can_pstats); + kfree(net->can.rx_alldev_list); + kfree(net->can.pkg_stats); + kfree(net->can.rcv_lists_stats); } -/* - * af_can module init/exit functions - */ +/* af_can module init/exit functions */ static struct packet_type can_packet __read_mostly = { .type = cpu_to_be16(ETH_P_CAN), diff --git a/net/can/af_can.h b/net/can/af_can.h index 9cb3719632bd..7c2d9161e224 100644 --- a/net/can/af_can.h +++ b/net/can/af_can.h @@ -1,5 +1,5 @@ -/* - * Copyright (c) 2002-2007 Volkswagen Group Electronic Research +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* Copyright (c) 2002-2007 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -53,32 +53,17 @@ struct receiver { canid_t can_id; canid_t mask; unsigned long matches; - void (*func)(struct sk_buff *, void *); + void (*func)(struct sk_buff *skb, void *data); void *data; char *ident; struct sock *sk; struct rcu_head rcu; }; -#define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS) -#define CAN_EFF_RCV_HASH_BITS 10 -#define CAN_EFF_RCV_ARRAY_SZ (1 << CAN_EFF_RCV_HASH_BITS) - -enum { RX_ERR, RX_ALL, RX_FIL, RX_INV, RX_MAX }; - -/* per device receive filters linked at dev->ml_priv */ -struct can_dev_rcv_lists { - struct hlist_head rx[RX_MAX]; - struct hlist_head rx_sff[CAN_SFF_RCV_ARRAY_SZ]; - struct hlist_head rx_eff[CAN_EFF_RCV_ARRAY_SZ]; - int remove_on_zero_entries; - int entries; -}; - /* statistic structures */ /* can be reset e.g. by can_init_stats() */ -struct s_stats { +struct can_pkg_stats { unsigned long jiffies_init; unsigned long rx_frames; @@ -103,7 +88,7 @@ struct s_stats { }; /* persistent statistics */ -struct s_pstats { +struct can_rcv_lists_stats { unsigned long stats_reset; unsigned long user_reset; unsigned long rcv_entries; diff --git a/net/can/bcm.c b/net/can/bcm.c index a34ee52f19ea..c96fa0f33db3 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) /* * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content * @@ -105,7 +106,6 @@ struct bcm_op { unsigned long frames_abs, frames_filtered; struct bcm_timeval ival1, ival2; struct hrtimer timer, thrtimer; - struct tasklet_struct tsklet, thrtsklet; ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg; int rx_ifindex; int cfsiz; @@ -370,25 +370,34 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, } } -static void bcm_tx_start_timer(struct bcm_op *op) +static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt) { + ktime_t ival; + if (op->kt_ival1 && op->count) - hrtimer_start(&op->timer, - ktime_add(ktime_get(), op->kt_ival1), - HRTIMER_MODE_ABS); + ival = op->kt_ival1; else if (op->kt_ival2) - hrtimer_start(&op->timer, - ktime_add(ktime_get(), op->kt_ival2), - HRTIMER_MODE_ABS); + ival = op->kt_ival2; + else + return false; + + hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival)); + return true; } -static void bcm_tx_timeout_tsklet(unsigned long data) +static void bcm_tx_start_timer(struct bcm_op *op) { - struct bcm_op *op = (struct bcm_op *)data; + if (bcm_tx_set_expiry(op, &op->timer)) + hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT); +} + +/* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */ +static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) +{ + struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); struct bcm_msg_head msg_head; if (op->kt_ival1 && (op->count > 0)) { - op->count--; if (!op->count && (op->flags & TX_COUNTEVT)) { @@ -405,22 +414,12 @@ static void bcm_tx_timeout_tsklet(unsigned long data) } bcm_can_tx(op); - } else if (op->kt_ival2) + } else if (op->kt_ival2) { bcm_can_tx(op); + } - bcm_tx_start_timer(op); -} - -/* - * bcm_tx_timeout_handler - performs cyclic CAN frame transmissions - */ -static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) -{ - struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); - - tasklet_schedule(&op->tsklet); - - return HRTIMER_NORESTART; + return bcm_tx_set_expiry(op, &op->timer) ? + HRTIMER_RESTART : HRTIMER_NORESTART; } /* @@ -486,7 +485,7 @@ static void bcm_rx_update_and_send(struct bcm_op *op, /* do not send the saved data - only start throttle timer */ hrtimer_start(&op->thrtimer, ktime_add(op->kt_lastmsg, op->kt_ival2), - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_SOFT); return; } @@ -545,14 +544,21 @@ static void bcm_rx_starttimer(struct bcm_op *op) return; if (op->kt_ival1) - hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL); + hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT); } -static void bcm_rx_timeout_tsklet(unsigned long data) +/* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */ +static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) { - struct bcm_op *op = (struct bcm_op *)data; + struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); struct bcm_msg_head msg_head; + /* if user wants to be informed, when cyclic CAN-Messages come back */ + if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { + /* clear received CAN frames to indicate 'nothing received' */ + memset(op->last_frames, 0, op->nframes * op->cfsiz); + } + /* create notification to user */ msg_head.opcode = RX_TIMEOUT; msg_head.flags = op->flags; @@ -563,25 +569,6 @@ static void bcm_rx_timeout_tsklet(unsigned long data) msg_head.nframes = 0; bcm_send_to_user(op, &msg_head, NULL, 0); -} - -/* - * bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out - */ -static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) -{ - struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); - - /* schedule before NET_RX_SOFTIRQ */ - tasklet_hi_schedule(&op->tsklet); - - /* no restart of the timer is done here! */ - - /* if user wants to be informed, when cyclic CAN-Messages come back */ - if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { - /* clear received CAN frames to indicate 'nothing received' */ - memset(op->last_frames, 0, op->nframes * op->cfsiz); - } return HRTIMER_NORESTART; } @@ -589,14 +576,12 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) /* * bcm_rx_do_flush - helper for bcm_rx_thr_flush */ -static inline int bcm_rx_do_flush(struct bcm_op *op, int update, - unsigned int index) +static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index) { struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; if ((op->last_frames) && (lcf->flags & RX_THR)) { - if (update) - bcm_rx_changed(op, lcf); + bcm_rx_changed(op, lcf); return 1; } return 0; @@ -604,11 +589,8 @@ static inline int bcm_rx_do_flush(struct bcm_op *op, int update, /* * bcm_rx_thr_flush - Check for throttled data and send it to the userspace - * - * update == 0 : just check if throttled data is available (any irq context) - * update == 1 : check and send throttled data to userspace (soft_irq context) */ -static int bcm_rx_thr_flush(struct bcm_op *op, int update) +static int bcm_rx_thr_flush(struct bcm_op *op) { int updated = 0; @@ -617,24 +599,16 @@ static int bcm_rx_thr_flush(struct bcm_op *op, int update) /* for MUX filter we start at index 1 */ for (i = 1; i < op->nframes; i++) - updated += bcm_rx_do_flush(op, update, i); + updated += bcm_rx_do_flush(op, i); } else { /* for RX_FILTER_ID and simple filter */ - updated += bcm_rx_do_flush(op, update, 0); + updated += bcm_rx_do_flush(op, 0); } return updated; } -static void bcm_rx_thr_tsklet(unsigned long data) -{ - struct bcm_op *op = (struct bcm_op *)data; - - /* push the changed data to the userspace */ - bcm_rx_thr_flush(op, 1); -} - /* * bcm_rx_thr_handler - the time for blocked content updates is over now: * Check for throttled data and send it to the userspace @@ -643,9 +617,7 @@ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer); - tasklet_schedule(&op->thrtsklet); - - if (bcm_rx_thr_flush(op, 0)) { + if (bcm_rx_thr_flush(op)) { hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2); return HRTIMER_RESTART; } else { @@ -741,23 +713,8 @@ static struct bcm_op *bcm_find_op(struct list_head *ops, static void bcm_remove_op(struct bcm_op *op) { - if (op->tsklet.func) { - while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) || - test_bit(TASKLET_STATE_RUN, &op->tsklet.state) || - hrtimer_active(&op->timer)) { - hrtimer_cancel(&op->timer); - tasklet_kill(&op->tsklet); - } - } - - if (op->thrtsklet.func) { - while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) || - test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) || - hrtimer_active(&op->thrtimer)) { - hrtimer_cancel(&op->thrtimer); - tasklet_kill(&op->thrtsklet); - } - } + hrtimer_cancel(&op->timer); + hrtimer_cancel(&op->thrtimer); if ((op->frames) && (op->frames != &op->sframe)) kfree(op->frames); @@ -990,15 +947,13 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ - hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&op->timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); op->timer.function = bcm_tx_timeout_handler; - /* initialize tasklet for tx countevent notification */ - tasklet_init(&op->tsklet, bcm_tx_timeout_tsklet, - (unsigned long) op); - /* currently unused in tx_ops */ - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); /* add this bcm_op to the list of the tx_ops */ list_add(&op->list, &bo->tx_ops); @@ -1167,20 +1122,14 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->rx_ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ - hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&op->timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); op->timer.function = bcm_rx_timeout_handler; - /* initialize tasklet for rx timeout notification */ - tasklet_init(&op->tsklet, bcm_rx_timeout_tsklet, - (unsigned long) op); - - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); op->thrtimer.function = bcm_rx_thr_handler; - /* initialize tasklet for rx throttle handling */ - tasklet_init(&op->thrtsklet, bcm_rx_thr_tsklet, - (unsigned long) op); - /* add this bcm_op to the list of the rx_ops */ list_add(&op->list, &bo->rx_ops); @@ -1226,12 +1175,12 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, */ op->kt_lastmsg = 0; hrtimer_cancel(&op->thrtimer); - bcm_rx_thr_flush(op, 1); + bcm_rx_thr_flush(op); } if ((op->flags & STARTTIMER) && op->kt_ival1) hrtimer_start(&op->timer, op->kt_ival1, - HRTIMER_MODE_REL); + HRTIMER_MODE_REL_SOFT); } /* now we can register for can_ids, if we added a new bcm_op */ @@ -1345,7 +1294,7 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) /* no bound device as default => check msg_name */ DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name); - if (msg->msg_namelen < sizeof(*addr)) + if (msg->msg_namelen < CAN_REQUIRED_SIZE(*addr, can_ifindex)) return -EINVAL; if (addr->can_family != AF_CAN) @@ -1587,7 +1536,7 @@ static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, struct net *net = sock_net(sk); int ret = 0; - if (len < sizeof(*addr)) + if (len < CAN_REQUIRED_SIZE(*addr, can_ifindex)) return -EINVAL; lock_sock(sk); @@ -1679,6 +1628,13 @@ static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, return size; } +static int bcm_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + /* no ioctls for socket layer -> hand it down to NIC layer */ + return -ENOIOCTLCMD; +} + static const struct proto_ops bcm_ops = { .family = PF_CAN, .release = bcm_release, @@ -1688,7 +1644,7 @@ static const struct proto_ops bcm_ops = { .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, - .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ + .ioctl = bcm_sock_no_ioctlcmd, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/can/gw.c b/net/can/gw.c index 72711053ebe6..65d60c93af29 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -1,7 +1,7 @@ -/* - * gw.c - CAN frame Gateway/Router/Bridge with netlink interface +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* gw.c - CAN frame Gateway/Router/Bridge with netlink interface * - * Copyright (c) 2017 Volkswagen Group Electronic Research + * Copyright (c) 2019 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -59,7 +59,7 @@ #include <net/net_namespace.h> #include <net/sock.h> -#define CAN_GW_VERSION "20170425" +#define CAN_GW_VERSION "20190810" #define CAN_GW_NAME "can-gw" MODULE_DESCRIPTION("PF_CAN netlink gateway"); @@ -85,10 +85,10 @@ static struct kmem_cache *cgw_cache __read_mostly; /* structure that contains the (on-the-fly) CAN frame modifications */ struct cf_mod { struct { - struct can_frame and; - struct can_frame or; - struct can_frame xor; - struct can_frame set; + struct canfd_frame and; + struct canfd_frame or; + struct canfd_frame xor; + struct canfd_frame set; } modframe; struct { u8 and; @@ -96,7 +96,7 @@ struct cf_mod { u8 xor; u8 set; } modtype; - void (*modfunc[MAX_MODFUNCTIONS])(struct can_frame *cf, + void (*modfunc[MAX_MODFUNCTIONS])(struct canfd_frame *cf, struct cf_mod *mod); /* CAN frame checksum calculation after CAN frame modifications */ @@ -105,15 +105,15 @@ struct cf_mod { struct cgw_csum_crc8 crc8; } csum; struct { - void (*xor)(struct can_frame *cf, struct cgw_csum_xor *xor); - void (*crc8)(struct can_frame *cf, struct cgw_csum_crc8 *crc8); + void (*xor)(struct canfd_frame *cf, + struct cgw_csum_xor *xor); + void (*crc8)(struct canfd_frame *cf, + struct cgw_csum_crc8 *crc8); } csumfunc; u32 uid; }; - -/* - * So far we just support CAN -> CAN routing and frame modifications. +/* So far we just support CAN -> CAN routing and frame modifications. * * The internal can_can_gw structure contains data and attributes for * a CAN -> CAN gateway job. @@ -151,39 +151,88 @@ struct cgw_job { /* modification functions that are invoked in the hot path in can_can_gw_rcv */ -#define MODFUNC(func, op) static void func(struct can_frame *cf, \ +#define MODFUNC(func, op) static void func(struct canfd_frame *cf, \ struct cf_mod *mod) { op ; } MODFUNC(mod_and_id, cf->can_id &= mod->modframe.and.can_id) -MODFUNC(mod_and_dlc, cf->can_dlc &= mod->modframe.and.can_dlc) +MODFUNC(mod_and_len, cf->len &= mod->modframe.and.len) +MODFUNC(mod_and_flags, cf->flags &= mod->modframe.and.flags) MODFUNC(mod_and_data, *(u64 *)cf->data &= *(u64 *)mod->modframe.and.data) MODFUNC(mod_or_id, cf->can_id |= mod->modframe.or.can_id) -MODFUNC(mod_or_dlc, cf->can_dlc |= mod->modframe.or.can_dlc) +MODFUNC(mod_or_len, cf->len |= mod->modframe.or.len) +MODFUNC(mod_or_flags, cf->flags |= mod->modframe.or.flags) MODFUNC(mod_or_data, *(u64 *)cf->data |= *(u64 *)mod->modframe.or.data) MODFUNC(mod_xor_id, cf->can_id ^= mod->modframe.xor.can_id) -MODFUNC(mod_xor_dlc, cf->can_dlc ^= mod->modframe.xor.can_dlc) +MODFUNC(mod_xor_len, cf->len ^= mod->modframe.xor.len) +MODFUNC(mod_xor_flags, cf->flags ^= mod->modframe.xor.flags) MODFUNC(mod_xor_data, *(u64 *)cf->data ^= *(u64 *)mod->modframe.xor.data) MODFUNC(mod_set_id, cf->can_id = mod->modframe.set.can_id) -MODFUNC(mod_set_dlc, cf->can_dlc = mod->modframe.set.can_dlc) +MODFUNC(mod_set_len, cf->len = mod->modframe.set.len) +MODFUNC(mod_set_flags, cf->flags = mod->modframe.set.flags) MODFUNC(mod_set_data, *(u64 *)cf->data = *(u64 *)mod->modframe.set.data) -static inline void canframecpy(struct can_frame *dst, struct can_frame *src) +static void mod_and_fddata(struct canfd_frame *cf, struct cf_mod *mod) +{ + int i; + + for (i = 0; i < CANFD_MAX_DLEN; i += 8) + *(u64 *)(cf->data + i) &= *(u64 *)(mod->modframe.and.data + i); +} + +static void mod_or_fddata(struct canfd_frame *cf, struct cf_mod *mod) +{ + int i; + + for (i = 0; i < CANFD_MAX_DLEN; i += 8) + *(u64 *)(cf->data + i) |= *(u64 *)(mod->modframe.or.data + i); +} + +static void mod_xor_fddata(struct canfd_frame *cf, struct cf_mod *mod) +{ + int i; + + for (i = 0; i < CANFD_MAX_DLEN; i += 8) + *(u64 *)(cf->data + i) ^= *(u64 *)(mod->modframe.xor.data + i); +} + +static void mod_set_fddata(struct canfd_frame *cf, struct cf_mod *mod) +{ + memcpy(cf->data, mod->modframe.set.data, CANFD_MAX_DLEN); +} + +static void canframecpy(struct canfd_frame *dst, struct can_frame *src) { - /* - * Copy the struct members separately to ensure that no uninitialized + /* Copy the struct members separately to ensure that no uninitialized * data are copied in the 3 bytes hole of the struct. This is needed * to make easy compares of the data in the struct cf_mod. */ dst->can_id = src->can_id; - dst->can_dlc = src->can_dlc; + dst->len = src->can_dlc; *(u64 *)dst->data = *(u64 *)src->data; } -static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re) +static void canfdframecpy(struct canfd_frame *dst, struct canfd_frame *src) { - /* - * absolute dlc values 0 .. 7 => 0 .. 7, e.g. data [0] + /* Copy the struct members separately to ensure that no uninitialized + * data are copied in the 2 bytes hole of the struct. This is needed + * to make easy compares of the data in the struct cf_mod. + */ + + dst->can_id = src->can_id; + dst->flags = src->flags; + dst->len = src->len; + memcpy(dst->data, src->data, CANFD_MAX_DLEN); +} + +static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re, struct rtcanmsg *r) +{ + s8 dlen = CAN_MAX_DLEN; + + if (r->flags & CGW_FLAGS_CAN_FD) + dlen = CANFD_MAX_DLEN; + + /* absolute dlc values 0 .. 7 => 0 .. 7, e.g. data [0] * relative to received dlc -1 .. -8 : * e.g. for received dlc = 8 * -1 => index = 7 (data[7]) @@ -191,27 +240,27 @@ static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re) * -8 => index = 0 (data[0]) */ - if (fr > -9 && fr < 8 && - to > -9 && to < 8 && - re > -9 && re < 8) + if (fr >= -dlen && fr < dlen && + to >= -dlen && to < dlen && + re >= -dlen && re < dlen) return 0; else return -EINVAL; } -static inline int calc_idx(int idx, int rx_dlc) +static inline int calc_idx(int idx, int rx_len) { if (idx < 0) - return rx_dlc + idx; + return rx_len + idx; else return idx; } -static void cgw_csum_xor_rel(struct can_frame *cf, struct cgw_csum_xor *xor) +static void cgw_csum_xor_rel(struct canfd_frame *cf, struct cgw_csum_xor *xor) { - int from = calc_idx(xor->from_idx, cf->can_dlc); - int to = calc_idx(xor->to_idx, cf->can_dlc); - int res = calc_idx(xor->result_idx, cf->can_dlc); + int from = calc_idx(xor->from_idx, cf->len); + int to = calc_idx(xor->to_idx, cf->len); + int res = calc_idx(xor->result_idx, cf->len); u8 val = xor->init_xor_val; int i; @@ -229,7 +278,7 @@ static void cgw_csum_xor_rel(struct can_frame *cf, struct cgw_csum_xor *xor) cf->data[res] = val; } -static void cgw_csum_xor_pos(struct can_frame *cf, struct cgw_csum_xor *xor) +static void cgw_csum_xor_pos(struct canfd_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; @@ -240,7 +289,7 @@ static void cgw_csum_xor_pos(struct can_frame *cf, struct cgw_csum_xor *xor) cf->data[xor->result_idx] = val; } -static void cgw_csum_xor_neg(struct can_frame *cf, struct cgw_csum_xor *xor) +static void cgw_csum_xor_neg(struct canfd_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; @@ -251,11 +300,12 @@ static void cgw_csum_xor_neg(struct can_frame *cf, struct cgw_csum_xor *xor) cf->data[xor->result_idx] = val; } -static void cgw_csum_crc8_rel(struct can_frame *cf, struct cgw_csum_crc8 *crc8) +static void cgw_csum_crc8_rel(struct canfd_frame *cf, + struct cgw_csum_crc8 *crc8) { - int from = calc_idx(crc8->from_idx, cf->can_dlc); - int to = calc_idx(crc8->to_idx, cf->can_dlc); - int res = calc_idx(crc8->result_idx, cf->can_dlc); + int from = calc_idx(crc8->from_idx, cf->len); + int to = calc_idx(crc8->to_idx, cf->len); + int res = calc_idx(crc8->result_idx, cf->len); u8 crc = crc8->init_crc_val; int i; @@ -264,96 +314,102 @@ static void cgw_csum_crc8_rel(struct can_frame *cf, struct cgw_csum_crc8 *crc8) if (from <= to) { for (i = crc8->from_idx; i <= crc8->to_idx; i++) - crc = crc8->crctab[crc^cf->data[i]]; + crc = crc8->crctab[crc ^ cf->data[i]]; } else { for (i = crc8->from_idx; i >= crc8->to_idx; i--) - crc = crc8->crctab[crc^cf->data[i]]; + crc = crc8->crctab[crc ^ cf->data[i]]; } switch (crc8->profile) { - case CGW_CRC8PRF_1U8: - crc = crc8->crctab[crc^crc8->profile_data[0]]; + crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: - crc = crc8->crctab[crc^crc8->profile_data[cf->data[1] & 0xF]]; + crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: - crc = crc8->crctab[crc^(cf->can_id & 0xFF)^ + crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; - } - cf->data[crc8->result_idx] = crc^crc8->final_xor_val; + cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } -static void cgw_csum_crc8_pos(struct can_frame *cf, struct cgw_csum_crc8 *crc8) +static void cgw_csum_crc8_pos(struct canfd_frame *cf, + struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; for (i = crc8->from_idx; i <= crc8->to_idx; i++) - crc = crc8->crctab[crc^cf->data[i]]; + crc = crc8->crctab[crc ^ cf->data[i]]; switch (crc8->profile) { - case CGW_CRC8PRF_1U8: - crc = crc8->crctab[crc^crc8->profile_data[0]]; + crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: - crc = crc8->crctab[crc^crc8->profile_data[cf->data[1] & 0xF]]; + crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: - crc = crc8->crctab[crc^(cf->can_id & 0xFF)^ + crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } - cf->data[crc8->result_idx] = crc^crc8->final_xor_val; + cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } -static void cgw_csum_crc8_neg(struct can_frame *cf, struct cgw_csum_crc8 *crc8) +static void cgw_csum_crc8_neg(struct canfd_frame *cf, + struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; for (i = crc8->from_idx; i >= crc8->to_idx; i--) - crc = crc8->crctab[crc^cf->data[i]]; + crc = crc8->crctab[crc ^ cf->data[i]]; switch (crc8->profile) { - case CGW_CRC8PRF_1U8: - crc = crc8->crctab[crc^crc8->profile_data[0]]; + crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: - crc = crc8->crctab[crc^crc8->profile_data[cf->data[1] & 0xF]]; + crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: - crc = crc8->crctab[crc^(cf->can_id & 0xFF)^ + crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } - cf->data[crc8->result_idx] = crc^crc8->final_xor_val; + cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } /* the receive & process & send function */ static void can_can_gw_rcv(struct sk_buff *skb, void *data) { struct cgw_job *gwj = (struct cgw_job *)data; - struct can_frame *cf; + struct canfd_frame *cf; struct sk_buff *nskb; int modidx = 0; - /* - * Do not handle CAN frames routed more than 'max_hops' times. + /* process strictly Classic CAN or CAN FD frames */ + if (gwj->flags & CGW_FLAGS_CAN_FD) { + if (skb->len != CANFD_MTU) + return; + } else { + if (skb->len != CAN_MTU) + return; + } + + /* Do not handle CAN frames routed more than 'max_hops' times. * In general we should never catch this delimiter which is intended * to cover a misconfiguration protection (e.g. circular CAN routes). * @@ -384,8 +440,7 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) can_skb_prv(skb)->ifindex == gwj->dst.dev->ifindex) return; - /* - * clone the given skb, which has not been done in can_rcv() + /* clone the given skb, which has not been done in can_rcv() * * When there is at least one modification function activated, * we need to copy the skb as we want to modify skb->data. @@ -410,7 +465,7 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) nskb->dev = gwj->dst.dev; /* pointer to modifiable CAN frame */ - cf = (struct can_frame *)nskb->data; + cf = (struct canfd_frame *)nskb->data; /* perform preprocessed modification functions if there are any */ while (modidx < MAX_MODFUNCTIONS && gwj->mod.modfunc[modidx]) @@ -419,26 +474,22 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) /* Has the CAN frame been modified? */ if (modidx) { /* get available space for the processed CAN frame type */ - int max_len = nskb->len - offsetof(struct can_frame, data); + int max_len = nskb->len - offsetof(struct canfd_frame, data); /* dlc may have changed, make sure it fits to the CAN frame */ - if (cf->can_dlc > max_len) - goto out_delete; - - /* check for checksum updates in classic CAN length only */ - if (gwj->mod.csumfunc.crc8) { - if (cf->can_dlc > 8) - goto out_delete; - - (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8); + if (cf->len > max_len) { + /* delete frame due to misconfiguration */ + gwj->deleted_frames++; + kfree_skb(nskb); + return; } - if (gwj->mod.csumfunc.xor) { - if (cf->can_dlc > 8) - goto out_delete; + /* check for checksum updates */ + if (gwj->mod.csumfunc.crc8) + (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8); + if (gwj->mod.csumfunc.xor) (*gwj->mod.csumfunc.xor)(cf, &gwj->mod.csum.xor); - } } /* clear the skb timestamp if not configured the other way */ @@ -450,14 +501,6 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) gwj->dropped_frames++; else gwj->handled_frames++; - - return; - - out_delete: - /* delete frame due to misconfiguration */ - gwj->deleted_frames++; - kfree_skb(nskb); - return; } static inline int cgw_register_filter(struct net *net, struct cgw_job *gwj) @@ -483,14 +526,12 @@ static int cgw_notifier(struct notifier_block *nb, return NOTIFY_DONE; if (msg == NETDEV_UNREGISTER) { - struct cgw_job *gwj = NULL; struct hlist_node *nx; ASSERT_RTNL(); hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { - if (gwj->src.dev == dev || gwj->dst.dev == dev) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); @@ -505,7 +546,6 @@ static int cgw_notifier(struct notifier_block *nb, static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, u32 pid, u32 seq, int flags) { - struct cgw_frame_mod mb; struct rtcanmsg *rtcan; struct nlmsghdr *nlh; @@ -542,32 +582,66 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, goto cancel; } - if (gwj->mod.modtype.and) { - memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.and; - if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0) - goto cancel; - } + if (gwj->flags & CGW_FLAGS_CAN_FD) { + struct cgw_fdframe_mod mb; - if (gwj->mod.modtype.or) { - memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.or; - if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0) - goto cancel; - } + if (gwj->mod.modtype.and) { + memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); + mb.modtype = gwj->mod.modtype.and; + if (nla_put(skb, CGW_FDMOD_AND, sizeof(mb), &mb) < 0) + goto cancel; + } - if (gwj->mod.modtype.xor) { - memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.xor; - if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0) - goto cancel; - } + if (gwj->mod.modtype.or) { + memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); + mb.modtype = gwj->mod.modtype.or; + if (nla_put(skb, CGW_FDMOD_OR, sizeof(mb), &mb) < 0) + goto cancel; + } - if (gwj->mod.modtype.set) { - memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.set; - if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0) - goto cancel; + if (gwj->mod.modtype.xor) { + memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); + mb.modtype = gwj->mod.modtype.xor; + if (nla_put(skb, CGW_FDMOD_XOR, sizeof(mb), &mb) < 0) + goto cancel; + } + + if (gwj->mod.modtype.set) { + memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); + mb.modtype = gwj->mod.modtype.set; + if (nla_put(skb, CGW_FDMOD_SET, sizeof(mb), &mb) < 0) + goto cancel; + } + } else { + struct cgw_frame_mod mb; + + if (gwj->mod.modtype.and) { + memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); + mb.modtype = gwj->mod.modtype.and; + if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0) + goto cancel; + } + + if (gwj->mod.modtype.or) { + memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); + mb.modtype = gwj->mod.modtype.or; + if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0) + goto cancel; + } + + if (gwj->mod.modtype.xor) { + memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); + mb.modtype = gwj->mod.modtype.xor; + if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0) + goto cancel; + } + + if (gwj->mod.modtype.set) { + memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); + mb.modtype = gwj->mod.modtype.set; + if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0) + goto cancel; + } } if (gwj->mod.uid) { @@ -588,7 +662,6 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, } if (gwj->gwtype == CGW_TYPE_CAN_CAN) { - if (gwj->ccgw.filter.can_id || gwj->ccgw.filter.can_mask) { if (nla_put(skb, CGW_FILTER, sizeof(struct can_filter), &gwj->ccgw.filter) < 0) @@ -623,8 +696,9 @@ static int cgw_dump_jobs(struct sk_buff *skb, struct netlink_callback *cb) if (idx < s_idx) goto cont; - if (cgw_put_job(skb, gwj, RTM_NEWROUTE, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) + if (cgw_put_job(skb, gwj, RTM_NEWROUTE, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) break; cont: idx++; @@ -636,7 +710,7 @@ cont: return skb->len; } -static const struct nla_policy cgw_policy[CGW_MAX+1] = { +static const struct nla_policy cgw_policy[CGW_MAX + 1] = { [CGW_MOD_AND] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_OR] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_XOR] = { .len = sizeof(struct cgw_frame_mod) }, @@ -648,14 +722,18 @@ static const struct nla_policy cgw_policy[CGW_MAX+1] = { [CGW_FILTER] = { .len = sizeof(struct can_filter) }, [CGW_LIM_HOPS] = { .type = NLA_U8 }, [CGW_MOD_UID] = { .type = NLA_U32 }, + [CGW_FDMOD_AND] = { .len = sizeof(struct cgw_fdframe_mod) }, + [CGW_FDMOD_OR] = { .len = sizeof(struct cgw_fdframe_mod) }, + [CGW_FDMOD_XOR] = { .len = sizeof(struct cgw_fdframe_mod) }, + [CGW_FDMOD_SET] = { .len = sizeof(struct cgw_fdframe_mod) }, }; /* check for common and gwtype specific attributes */ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, u8 gwtype, void *gwtypeattr, u8 *limhops) { - struct nlattr *tb[CGW_MAX+1]; - struct cgw_frame_mod mb; + struct nlattr *tb[CGW_MAX + 1]; + struct rtcanmsg *r = nlmsg_data(nlh); int modidx = 0; int err = 0; @@ -675,87 +753,166 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, } /* check for AND/OR/XOR/SET modifications */ + if (r->flags & CGW_FLAGS_CAN_FD) { + struct cgw_fdframe_mod mb; - if (tb[CGW_MOD_AND]) { - nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN); + if (tb[CGW_FDMOD_AND]) { + nla_memcpy(&mb, tb[CGW_FDMOD_AND], CGW_FDMODATTR_LEN); - canframecpy(&mod->modframe.and, &mb.cf); - mod->modtype.and = mb.modtype; + canfdframecpy(&mod->modframe.and, &mb.cf); + mod->modtype.and = mb.modtype; - if (mb.modtype & CGW_MOD_ID) - mod->modfunc[modidx++] = mod_and_id; + if (mb.modtype & CGW_MOD_ID) + mod->modfunc[modidx++] = mod_and_id; - if (mb.modtype & CGW_MOD_DLC) - mod->modfunc[modidx++] = mod_and_dlc; + if (mb.modtype & CGW_MOD_LEN) + mod->modfunc[modidx++] = mod_and_len; - if (mb.modtype & CGW_MOD_DATA) - mod->modfunc[modidx++] = mod_and_data; - } + if (mb.modtype & CGW_MOD_FLAGS) + mod->modfunc[modidx++] = mod_and_flags; + + if (mb.modtype & CGW_MOD_DATA) + mod->modfunc[modidx++] = mod_and_fddata; + } - if (tb[CGW_MOD_OR]) { - nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN); + if (tb[CGW_FDMOD_OR]) { + nla_memcpy(&mb, tb[CGW_FDMOD_OR], CGW_FDMODATTR_LEN); - canframecpy(&mod->modframe.or, &mb.cf); - mod->modtype.or = mb.modtype; + canfdframecpy(&mod->modframe.or, &mb.cf); + mod->modtype.or = mb.modtype; - if (mb.modtype & CGW_MOD_ID) - mod->modfunc[modidx++] = mod_or_id; + if (mb.modtype & CGW_MOD_ID) + mod->modfunc[modidx++] = mod_or_id; - if (mb.modtype & CGW_MOD_DLC) - mod->modfunc[modidx++] = mod_or_dlc; + if (mb.modtype & CGW_MOD_LEN) + mod->modfunc[modidx++] = mod_or_len; - if (mb.modtype & CGW_MOD_DATA) - mod->modfunc[modidx++] = mod_or_data; - } + if (mb.modtype & CGW_MOD_FLAGS) + mod->modfunc[modidx++] = mod_or_flags; - if (tb[CGW_MOD_XOR]) { - nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN); + if (mb.modtype & CGW_MOD_DATA) + mod->modfunc[modidx++] = mod_or_fddata; + } - canframecpy(&mod->modframe.xor, &mb.cf); - mod->modtype.xor = mb.modtype; + if (tb[CGW_FDMOD_XOR]) { + nla_memcpy(&mb, tb[CGW_FDMOD_XOR], CGW_FDMODATTR_LEN); - if (mb.modtype & CGW_MOD_ID) - mod->modfunc[modidx++] = mod_xor_id; + canfdframecpy(&mod->modframe.xor, &mb.cf); + mod->modtype.xor = mb.modtype; - if (mb.modtype & CGW_MOD_DLC) - mod->modfunc[modidx++] = mod_xor_dlc; + if (mb.modtype & CGW_MOD_ID) + mod->modfunc[modidx++] = mod_xor_id; - if (mb.modtype & CGW_MOD_DATA) - mod->modfunc[modidx++] = mod_xor_data; - } + if (mb.modtype & CGW_MOD_LEN) + mod->modfunc[modidx++] = mod_xor_len; - if (tb[CGW_MOD_SET]) { - nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN); + if (mb.modtype & CGW_MOD_FLAGS) + mod->modfunc[modidx++] = mod_xor_flags; + + if (mb.modtype & CGW_MOD_DATA) + mod->modfunc[modidx++] = mod_xor_fddata; + } - canframecpy(&mod->modframe.set, &mb.cf); - mod->modtype.set = mb.modtype; + if (tb[CGW_FDMOD_SET]) { + nla_memcpy(&mb, tb[CGW_FDMOD_SET], CGW_FDMODATTR_LEN); + + canfdframecpy(&mod->modframe.set, &mb.cf); + mod->modtype.set = mb.modtype; + + if (mb.modtype & CGW_MOD_ID) + mod->modfunc[modidx++] = mod_set_id; + + if (mb.modtype & CGW_MOD_LEN) + mod->modfunc[modidx++] = mod_set_len; + + if (mb.modtype & CGW_MOD_FLAGS) + mod->modfunc[modidx++] = mod_set_flags; + + if (mb.modtype & CGW_MOD_DATA) + mod->modfunc[modidx++] = mod_set_fddata; + } + } else { + struct cgw_frame_mod mb; - if (mb.modtype & CGW_MOD_ID) - mod->modfunc[modidx++] = mod_set_id; + if (tb[CGW_MOD_AND]) { + nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN); - if (mb.modtype & CGW_MOD_DLC) - mod->modfunc[modidx++] = mod_set_dlc; + canframecpy(&mod->modframe.and, &mb.cf); + mod->modtype.and = mb.modtype; - if (mb.modtype & CGW_MOD_DATA) - mod->modfunc[modidx++] = mod_set_data; + if (mb.modtype & CGW_MOD_ID) + mod->modfunc[modidx++] = mod_and_id; + + if (mb.modtype & CGW_MOD_LEN) + mod->modfunc[modidx++] = mod_and_len; + + if (mb.modtype & CGW_MOD_DATA) + mod->modfunc[modidx++] = mod_and_data; + } + + if (tb[CGW_MOD_OR]) { + nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN); + + canframecpy(&mod->modframe.or, &mb.cf); + mod->modtype.or = mb.modtype; + + if (mb.modtype & CGW_MOD_ID) + mod->modfunc[modidx++] = mod_or_id; + + if (mb.modtype & CGW_MOD_LEN) + mod->modfunc[modidx++] = mod_or_len; + + if (mb.modtype & CGW_MOD_DATA) + mod->modfunc[modidx++] = mod_or_data; + } + + if (tb[CGW_MOD_XOR]) { + nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN); + + canframecpy(&mod->modframe.xor, &mb.cf); + mod->modtype.xor = mb.modtype; + + if (mb.modtype & CGW_MOD_ID) + mod->modfunc[modidx++] = mod_xor_id; + + if (mb.modtype & CGW_MOD_LEN) + mod->modfunc[modidx++] = mod_xor_len; + + if (mb.modtype & CGW_MOD_DATA) + mod->modfunc[modidx++] = mod_xor_data; + } + + if (tb[CGW_MOD_SET]) { + nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN); + + canframecpy(&mod->modframe.set, &mb.cf); + mod->modtype.set = mb.modtype; + + if (mb.modtype & CGW_MOD_ID) + mod->modfunc[modidx++] = mod_set_id; + + if (mb.modtype & CGW_MOD_LEN) + mod->modfunc[modidx++] = mod_set_len; + + if (mb.modtype & CGW_MOD_DATA) + mod->modfunc[modidx++] = mod_set_data; + } } /* check for checksum operations after CAN frame modifications */ if (modidx) { - if (tb[CGW_CS_CRC8]) { struct cgw_csum_crc8 *c = nla_data(tb[CGW_CS_CRC8]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, - c->result_idx); + c->result_idx, r); if (err) return err; nla_memcpy(&mod->csum.crc8, tb[CGW_CS_CRC8], CGW_CS_CRC8_LEN); - /* - * select dedicated processing function to reduce + /* select dedicated processing function to reduce * runtime operations in receive hot path. */ if (c->from_idx < 0 || c->to_idx < 0 || @@ -771,15 +928,14 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, struct cgw_csum_xor *c = nla_data(tb[CGW_CS_XOR]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, - c->result_idx); + c->result_idx, r); if (err) return err; nla_memcpy(&mod->csum.xor, tb[CGW_CS_XOR], CGW_CS_XOR_LEN); - /* - * select dedicated processing function to reduce + /* select dedicated processing function to reduce * runtime operations in receive hot path. */ if (c->from_idx < 0 || c->to_idx < 0 || @@ -791,16 +947,14 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, mod->csumfunc.xor = cgw_csum_xor_neg; } - if (tb[CGW_MOD_UID]) { + if (tb[CGW_MOD_UID]) nla_memcpy(&mod->uid, tb[CGW_MOD_UID], sizeof(u32)); - } } if (gwtype == CGW_TYPE_CAN_CAN) { - /* check CGW_TYPE_CAN_CAN specific attributes */ - struct can_can_gw *ccgw = (struct can_can_gw *)gwtypeattr; + memset(ccgw, 0, sizeof(*ccgw)); /* check for can_filter in attributes */ @@ -861,12 +1015,10 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, return err; if (mod.uid) { - ASSERT_RTNL(); /* check for updating an existing job with identical uid */ hlist_for_each_entry(gwj, &net->can.cgw_list, list) { - if (gwj->mod.uid != mod.uid) continue; @@ -987,7 +1139,6 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, /* remove only the first matching entry */ hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { - if (gwj->flags != r->flags) continue; diff --git a/net/can/j1939/Kconfig b/net/can/j1939/Kconfig new file mode 100644 index 000000000000..2998298b71ec --- /dev/null +++ b/net/can/j1939/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# SAE J1939 network layer core configuration +# + +config CAN_J1939 + tristate "SAE J1939" + depends on CAN + help + SAE J1939 + Say Y to have in-kernel support for j1939 socket type. This + allows communication according to SAE j1939. + The relevant parts in kernel are + SAE j1939-21 (datalink & transport protocol) + & SAE j1939-81 (network management). diff --git a/net/can/j1939/Makefile b/net/can/j1939/Makefile new file mode 100644 index 000000000000..19181bdae173 --- /dev/null +++ b/net/can/j1939/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_CAN_J1939) += can-j1939.o + +can-j1939-objs := \ + address-claim.o \ + bus.o \ + main.o \ + socket.o \ + transport.o diff --git a/net/can/j1939/address-claim.c b/net/can/j1939/address-claim.c new file mode 100644 index 000000000000..f33c47327927 --- /dev/null +++ b/net/can/j1939/address-claim.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2010-2011 EIA Electronics, +// Kurt Van Dijck <kurt.van.dijck@eia.be> +// Copyright (c) 2010-2011 EIA Electronics, +// Pieter Beyens <pieter.beyens@eia.be> +// Copyright (c) 2017-2019 Pengutronix, +// Marc Kleine-Budde <kernel@pengutronix.de> +// Copyright (c) 2017-2019 Pengutronix, +// Oleksij Rempel <kernel@pengutronix.de> + +/* J1939 Address Claiming. + * Address Claiming in the kernel + * - keeps track of the AC states of ECU's, + * - resolves NAME<=>SA taking into account the AC states of ECU's. + * + * All Address Claim msgs (including host-originated msg) are processed + * at the receive path (a sent msg is always received again via CAN echo). + * As such, the processing of AC msgs is done in the order on which msgs + * are sent on the bus. + * + * This module doesn't send msgs itself (e.g. replies on Address Claims), + * this is the responsibility of a user space application or daemon. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/netdevice.h> +#include <linux/skbuff.h> + +#include "j1939-priv.h" + +static inline name_t j1939_skb_to_name(const struct sk_buff *skb) +{ + return le64_to_cpup((__le64 *)skb->data); +} + +static inline bool j1939_ac_msg_is_request(struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + int req_pgn; + + if (skb->len < 3 || skcb->addr.pgn != J1939_PGN_REQUEST) + return false; + + req_pgn = skb->data[0] | (skb->data[1] << 8) | (skb->data[2] << 16); + + return req_pgn == J1939_PGN_ADDRESS_CLAIMED; +} + +static int j1939_ac_verify_outgoing(struct j1939_priv *priv, + struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + + if (skb->len != 8) { + netdev_notice(priv->ndev, "tx address claim with dlc %i\n", + skb->len); + return -EPROTO; + } + + if (skcb->addr.src_name != j1939_skb_to_name(skb)) { + netdev_notice(priv->ndev, "tx address claim with different name\n"); + return -EPROTO; + } + + if (skcb->addr.sa == J1939_NO_ADDR) { + netdev_notice(priv->ndev, "tx address claim with broadcast sa\n"); + return -EPROTO; + } + + /* ac must always be a broadcast */ + if (skcb->addr.dst_name || skcb->addr.da != J1939_NO_ADDR) { + netdev_notice(priv->ndev, "tx address claim with dest, not broadcast\n"); + return -EPROTO; + } + return 0; +} + +int j1939_ac_fixup(struct j1939_priv *priv, struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + int ret; + u8 addr; + + /* network mgmt: address claiming msgs */ + if (skcb->addr.pgn == J1939_PGN_ADDRESS_CLAIMED) { + struct j1939_ecu *ecu; + + ret = j1939_ac_verify_outgoing(priv, skb); + /* return both when failure & when successful */ + if (ret < 0) + return ret; + ecu = j1939_ecu_get_by_name(priv, skcb->addr.src_name); + if (!ecu) + return -ENODEV; + + if (ecu->addr != skcb->addr.sa) + /* hold further traffic for ecu, remove from parent */ + j1939_ecu_unmap(ecu); + j1939_ecu_put(ecu); + } else if (skcb->addr.src_name) { + /* assign source address */ + addr = j1939_name_to_addr(priv, skcb->addr.src_name); + if (!j1939_address_is_unicast(addr) && + !j1939_ac_msg_is_request(skb)) { + netdev_notice(priv->ndev, "tx drop: invalid sa for name 0x%016llx\n", + skcb->addr.src_name); + return -EADDRNOTAVAIL; + } + skcb->addr.sa = addr; + } + + /* assign destination address */ + if (skcb->addr.dst_name) { + addr = j1939_name_to_addr(priv, skcb->addr.dst_name); + if (!j1939_address_is_unicast(addr)) { + netdev_notice(priv->ndev, "tx drop: invalid da for name 0x%016llx\n", + skcb->addr.dst_name); + return -EADDRNOTAVAIL; + } + skcb->addr.da = addr; + } + return 0; +} + +static void j1939_ac_process(struct j1939_priv *priv, struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_ecu *ecu, *prev; + name_t name; + + if (skb->len != 8) { + netdev_notice(priv->ndev, "rx address claim with wrong dlc %i\n", + skb->len); + return; + } + + name = j1939_skb_to_name(skb); + skcb->addr.src_name = name; + if (!name) { + netdev_notice(priv->ndev, "rx address claim without name\n"); + return; + } + + if (!j1939_address_is_valid(skcb->addr.sa)) { + netdev_notice(priv->ndev, "rx address claim with broadcast sa\n"); + return; + } + + write_lock_bh(&priv->lock); + + /* Few words on the ECU ref counting: + * + * First we get an ECU handle, either with + * j1939_ecu_get_by_name_locked() (increments the ref counter) + * or j1939_ecu_create_locked() (initializes an ECU object + * with a ref counter of 1). + * + * j1939_ecu_unmap_locked() will decrement the ref counter, + * but only if the ECU was mapped before. So "ecu" still + * belongs to us. + * + * j1939_ecu_timer_start() will increment the ref counter + * before it starts the timer, so we can put the ecu when + * leaving this function. + */ + ecu = j1939_ecu_get_by_name_locked(priv, name); + if (!ecu && j1939_address_is_unicast(skcb->addr.sa)) + ecu = j1939_ecu_create_locked(priv, name); + + if (IS_ERR_OR_NULL(ecu)) + goto out_unlock_bh; + + /* cancel pending (previous) address claim */ + j1939_ecu_timer_cancel(ecu); + + if (j1939_address_is_idle(skcb->addr.sa)) { + j1939_ecu_unmap_locked(ecu); + goto out_ecu_put; + } + + /* save new addr */ + if (ecu->addr != skcb->addr.sa) + j1939_ecu_unmap_locked(ecu); + ecu->addr = skcb->addr.sa; + + prev = j1939_ecu_get_by_addr_locked(priv, skcb->addr.sa); + if (prev) { + if (ecu->name > prev->name) { + j1939_ecu_unmap_locked(ecu); + j1939_ecu_put(prev); + goto out_ecu_put; + } else { + /* kick prev if less or equal */ + j1939_ecu_unmap_locked(prev); + j1939_ecu_put(prev); + } + } + + j1939_ecu_timer_start(ecu); + out_ecu_put: + j1939_ecu_put(ecu); + out_unlock_bh: + write_unlock_bh(&priv->lock); +} + +void j1939_ac_recv(struct j1939_priv *priv, struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_ecu *ecu; + + /* network mgmt */ + if (skcb->addr.pgn == J1939_PGN_ADDRESS_CLAIMED) { + j1939_ac_process(priv, skb); + } else if (j1939_address_is_unicast(skcb->addr.sa)) { + /* assign source name */ + ecu = j1939_ecu_get_by_addr(priv, skcb->addr.sa); + if (ecu) { + skcb->addr.src_name = ecu->name; + j1939_ecu_put(ecu); + } + } + + /* assign destination name */ + ecu = j1939_ecu_get_by_addr(priv, skcb->addr.da); + if (ecu) { + skcb->addr.dst_name = ecu->name; + j1939_ecu_put(ecu); + } +} diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c new file mode 100644 index 000000000000..486687901602 --- /dev/null +++ b/net/can/j1939/bus.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2010-2011 EIA Electronics, +// Kurt Van Dijck <kurt.van.dijck@eia.be> +// Copyright (c) 2017-2019 Pengutronix, +// Marc Kleine-Budde <kernel@pengutronix.de> +// Copyright (c) 2017-2019 Pengutronix, +// Oleksij Rempel <kernel@pengutronix.de> + +/* bus for j1939 remote devices + * Since rtnetlink, no real bus is used. + */ + +#include <net/sock.h> + +#include "j1939-priv.h" + +static void __j1939_ecu_release(struct kref *kref) +{ + struct j1939_ecu *ecu = container_of(kref, struct j1939_ecu, kref); + struct j1939_priv *priv = ecu->priv; + + list_del(&ecu->list); + kfree(ecu); + j1939_priv_put(priv); +} + +void j1939_ecu_put(struct j1939_ecu *ecu) +{ + kref_put(&ecu->kref, __j1939_ecu_release); +} + +static void j1939_ecu_get(struct j1939_ecu *ecu) +{ + kref_get(&ecu->kref); +} + +static bool j1939_ecu_is_mapped_locked(struct j1939_ecu *ecu) +{ + struct j1939_priv *priv = ecu->priv; + + lockdep_assert_held(&priv->lock); + + return j1939_ecu_find_by_addr_locked(priv, ecu->addr) == ecu; +} + +/* ECU device interface */ +/* map ECU to a bus address space */ +static void j1939_ecu_map_locked(struct j1939_ecu *ecu) +{ + struct j1939_priv *priv = ecu->priv; + struct j1939_addr_ent *ent; + + lockdep_assert_held(&priv->lock); + + if (!j1939_address_is_unicast(ecu->addr)) + return; + + ent = &priv->ents[ecu->addr]; + + if (ent->ecu) { + netdev_warn(priv->ndev, "Trying to map already mapped ECU, addr: 0x%02x, name: 0x%016llx. Skip it.\n", + ecu->addr, ecu->name); + return; + } + + j1939_ecu_get(ecu); + ent->ecu = ecu; + ent->nusers += ecu->nusers; +} + +/* unmap ECU from a bus address space */ +void j1939_ecu_unmap_locked(struct j1939_ecu *ecu) +{ + struct j1939_priv *priv = ecu->priv; + struct j1939_addr_ent *ent; + + lockdep_assert_held(&priv->lock); + + if (!j1939_address_is_unicast(ecu->addr)) + return; + + if (!j1939_ecu_is_mapped_locked(ecu)) + return; + + ent = &priv->ents[ecu->addr]; + ent->ecu = NULL; + ent->nusers -= ecu->nusers; + j1939_ecu_put(ecu); +} + +void j1939_ecu_unmap(struct j1939_ecu *ecu) +{ + write_lock_bh(&ecu->priv->lock); + j1939_ecu_unmap_locked(ecu); + write_unlock_bh(&ecu->priv->lock); +} + +void j1939_ecu_unmap_all(struct j1939_priv *priv) +{ + int i; + + write_lock_bh(&priv->lock); + for (i = 0; i < ARRAY_SIZE(priv->ents); i++) + if (priv->ents[i].ecu) + j1939_ecu_unmap_locked(priv->ents[i].ecu); + write_unlock_bh(&priv->lock); +} + +void j1939_ecu_timer_start(struct j1939_ecu *ecu) +{ + /* The ECU is held here and released in the + * j1939_ecu_timer_handler() or j1939_ecu_timer_cancel(). + */ + j1939_ecu_get(ecu); + + /* Schedule timer in 250 msec to commit address change. */ + hrtimer_start(&ecu->ac_timer, ms_to_ktime(250), + HRTIMER_MODE_REL_SOFT); +} + +void j1939_ecu_timer_cancel(struct j1939_ecu *ecu) +{ + if (hrtimer_cancel(&ecu->ac_timer)) + j1939_ecu_put(ecu); +} + +static enum hrtimer_restart j1939_ecu_timer_handler(struct hrtimer *hrtimer) +{ + struct j1939_ecu *ecu = + container_of(hrtimer, struct j1939_ecu, ac_timer); + struct j1939_priv *priv = ecu->priv; + + write_lock_bh(&priv->lock); + /* TODO: can we test if ecu->addr is unicast before starting + * the timer? + */ + j1939_ecu_map_locked(ecu); + + /* The corresponding j1939_ecu_get() is in + * j1939_ecu_timer_start(). + */ + j1939_ecu_put(ecu); + write_unlock_bh(&priv->lock); + + return HRTIMER_NORESTART; +} + +struct j1939_ecu *j1939_ecu_create_locked(struct j1939_priv *priv, name_t name) +{ + struct j1939_ecu *ecu; + + lockdep_assert_held(&priv->lock); + + ecu = kzalloc(sizeof(*ecu), gfp_any()); + if (!ecu) + return ERR_PTR(-ENOMEM); + kref_init(&ecu->kref); + ecu->addr = J1939_IDLE_ADDR; + ecu->name = name; + + hrtimer_init(&ecu->ac_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + ecu->ac_timer.function = j1939_ecu_timer_handler; + INIT_LIST_HEAD(&ecu->list); + + j1939_priv_get(priv); + ecu->priv = priv; + list_add_tail(&ecu->list, &priv->ecus); + + return ecu; +} + +struct j1939_ecu *j1939_ecu_find_by_addr_locked(struct j1939_priv *priv, + u8 addr) +{ + lockdep_assert_held(&priv->lock); + + return priv->ents[addr].ecu; +} + +struct j1939_ecu *j1939_ecu_get_by_addr_locked(struct j1939_priv *priv, u8 addr) +{ + struct j1939_ecu *ecu; + + lockdep_assert_held(&priv->lock); + + if (!j1939_address_is_unicast(addr)) + return NULL; + + ecu = j1939_ecu_find_by_addr_locked(priv, addr); + if (ecu) + j1939_ecu_get(ecu); + + return ecu; +} + +struct j1939_ecu *j1939_ecu_get_by_addr(struct j1939_priv *priv, u8 addr) +{ + struct j1939_ecu *ecu; + + read_lock_bh(&priv->lock); + ecu = j1939_ecu_get_by_addr_locked(priv, addr); + read_unlock_bh(&priv->lock); + + return ecu; +} + +/* get pointer to ecu without increasing ref counter */ +static struct j1939_ecu *j1939_ecu_find_by_name_locked(struct j1939_priv *priv, + name_t name) +{ + struct j1939_ecu *ecu; + + lockdep_assert_held(&priv->lock); + + list_for_each_entry(ecu, &priv->ecus, list) { + if (ecu->name == name) + return ecu; + } + + return NULL; +} + +struct j1939_ecu *j1939_ecu_get_by_name_locked(struct j1939_priv *priv, + name_t name) +{ + struct j1939_ecu *ecu; + + lockdep_assert_held(&priv->lock); + + if (!name) + return NULL; + + ecu = j1939_ecu_find_by_name_locked(priv, name); + if (ecu) + j1939_ecu_get(ecu); + + return ecu; +} + +struct j1939_ecu *j1939_ecu_get_by_name(struct j1939_priv *priv, name_t name) +{ + struct j1939_ecu *ecu; + + read_lock_bh(&priv->lock); + ecu = j1939_ecu_get_by_name_locked(priv, name); + read_unlock_bh(&priv->lock); + + return ecu; +} + +u8 j1939_name_to_addr(struct j1939_priv *priv, name_t name) +{ + struct j1939_ecu *ecu; + int addr = J1939_IDLE_ADDR; + + if (!name) + return J1939_NO_ADDR; + + read_lock_bh(&priv->lock); + ecu = j1939_ecu_find_by_name_locked(priv, name); + if (ecu && j1939_ecu_is_mapped_locked(ecu)) + /* ecu's SA is registered */ + addr = ecu->addr; + + read_unlock_bh(&priv->lock); + + return addr; +} + +/* TX addr/name accounting + * Transport protocol needs to know if a SA is local or not + * These functions originate from userspace manipulating sockets, + * so locking is straigforward + */ + +int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa) +{ + struct j1939_ecu *ecu; + int err = 0; + + write_lock_bh(&priv->lock); + + if (j1939_address_is_unicast(sa)) + priv->ents[sa].nusers++; + + if (!name) + goto done; + + ecu = j1939_ecu_get_by_name_locked(priv, name); + if (!ecu) + ecu = j1939_ecu_create_locked(priv, name); + err = PTR_ERR_OR_ZERO(ecu); + if (err) + goto done; + + ecu->nusers++; + /* TODO: do we care if ecu->addr != sa? */ + if (j1939_ecu_is_mapped_locked(ecu)) + /* ecu's sa is active already */ + priv->ents[ecu->addr].nusers++; + + done: + write_unlock_bh(&priv->lock); + + return err; +} + +void j1939_local_ecu_put(struct j1939_priv *priv, name_t name, u8 sa) +{ + struct j1939_ecu *ecu; + + write_lock_bh(&priv->lock); + + if (j1939_address_is_unicast(sa)) + priv->ents[sa].nusers--; + + if (!name) + goto done; + + ecu = j1939_ecu_find_by_name_locked(priv, name); + if (WARN_ON_ONCE(!ecu)) + goto done; + + ecu->nusers--; + /* TODO: do we care if ecu->addr != sa? */ + if (j1939_ecu_is_mapped_locked(ecu)) + /* ecu's sa is active already */ + priv->ents[ecu->addr].nusers--; + j1939_ecu_put(ecu); + + done: + write_unlock_bh(&priv->lock); +} diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h new file mode 100644 index 000000000000..12369b604ce9 --- /dev/null +++ b/net/can/j1939/j1939-priv.h @@ -0,0 +1,338 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2010-2011 EIA Electronics, +// Kurt Van Dijck <kurt.van.dijck@eia.be> +// Copyright (c) 2017-2019 Pengutronix, +// Marc Kleine-Budde <kernel@pengutronix.de> +// Copyright (c) 2017-2019 Pengutronix, +// Oleksij Rempel <kernel@pengutronix.de> + +#ifndef _J1939_PRIV_H_ +#define _J1939_PRIV_H_ + +#include <linux/can/j1939.h> +#include <net/sock.h> + +/* Timeout to receive the abort signal over loop back. In case CAN + * bus is open, the timeout should be triggered. + */ +#define J1939_XTP_ABORT_TIMEOUT_MS 500 +#define J1939_SIMPLE_ECHO_TIMEOUT_MS (10 * 1000) + +struct j1939_session; +enum j1939_sk_errqueue_type { + J1939_ERRQUEUE_ACK, + J1939_ERRQUEUE_SCHED, + J1939_ERRQUEUE_ABORT, +}; + +/* j1939 devices */ +struct j1939_ecu { + struct list_head list; + name_t name; + u8 addr; + + /* indicates that this ecu successfully claimed @sa as its address */ + struct hrtimer ac_timer; + struct kref kref; + struct j1939_priv *priv; + + /* count users, to help transport protocol decide for interaction */ + int nusers; +}; + +struct j1939_priv { + struct list_head ecus; + /* local list entry in priv + * These allow irq (& softirq) context lookups on j1939 devices + * This approach (separate lists) is done as the other 2 alternatives + * are not easier or even wrong + * 1) using the pure kobject methods involves mutexes, which are not + * allowed in irq context. + * 2) duplicating data structures would require a lot of synchronization + * code + * usage: + */ + + /* segments need a lock to protect the above list */ + rwlock_t lock; + + struct net_device *ndev; + + /* list of 256 ecu ptrs, that cache the claimed addresses. + * also protected by the above lock + */ + struct j1939_addr_ent { + struct j1939_ecu *ecu; + /* count users, to help transport protocol */ + int nusers; + } ents[256]; + + struct kref kref; + + /* List of active sessions to prevent start of conflicting + * one. + * + * Do not start two sessions of same type, addresses and + * direction. + */ + struct list_head active_session_list; + + /* protects active_session_list */ + spinlock_t active_session_list_lock; + + unsigned int tp_max_packet_size; + + /* lock for j1939_socks list */ + spinlock_t j1939_socks_lock; + struct list_head j1939_socks; + + struct kref rx_kref; +}; + +void j1939_ecu_put(struct j1939_ecu *ecu); + +/* keep the cache of what is local */ +int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa); +void j1939_local_ecu_put(struct j1939_priv *priv, name_t name, u8 sa); + +static inline bool j1939_address_is_unicast(u8 addr) +{ + return addr <= J1939_MAX_UNICAST_ADDR; +} + +static inline bool j1939_address_is_idle(u8 addr) +{ + return addr == J1939_IDLE_ADDR; +} + +static inline bool j1939_address_is_valid(u8 addr) +{ + return addr != J1939_NO_ADDR; +} + +static inline bool j1939_pgn_is_pdu1(pgn_t pgn) +{ + /* ignore dp & res bits for this */ + return (pgn & 0xff00) < 0xf000; +} + +/* utility to correctly unmap an ECU */ +void j1939_ecu_unmap_locked(struct j1939_ecu *ecu); +void j1939_ecu_unmap(struct j1939_ecu *ecu); + +u8 j1939_name_to_addr(struct j1939_priv *priv, name_t name); +struct j1939_ecu *j1939_ecu_find_by_addr_locked(struct j1939_priv *priv, + u8 addr); +struct j1939_ecu *j1939_ecu_get_by_addr(struct j1939_priv *priv, u8 addr); +struct j1939_ecu *j1939_ecu_get_by_addr_locked(struct j1939_priv *priv, + u8 addr); +struct j1939_ecu *j1939_ecu_get_by_name(struct j1939_priv *priv, name_t name); +struct j1939_ecu *j1939_ecu_get_by_name_locked(struct j1939_priv *priv, + name_t name); + +enum j1939_transfer_type { + J1939_TP, + J1939_ETP, + J1939_SIMPLE, +}; + +struct j1939_addr { + name_t src_name; + name_t dst_name; + pgn_t pgn; + + u8 sa; + u8 da; + + u8 type; +}; + +/* control buffer of the sk_buff */ +struct j1939_sk_buff_cb { + /* Offset in bytes within one ETP session */ + u32 offset; + + /* for tx, MSG_SYN will be used to sync on sockets */ + u32 msg_flags; + u32 tskey; + + struct j1939_addr addr; + + /* Flags for quick lookups during skb processing. + * These are set in the receive path only. + */ +#define J1939_ECU_LOCAL_SRC BIT(0) +#define J1939_ECU_LOCAL_DST BIT(1) + u8 flags; + + priority_t priority; +}; + +static inline +struct j1939_sk_buff_cb *j1939_skb_to_cb(const struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct j1939_sk_buff_cb) > sizeof(skb->cb)); + + return (struct j1939_sk_buff_cb *)skb->cb; +} + +int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb); +void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb); +bool j1939_sk_recv_match(struct j1939_priv *priv, + struct j1939_sk_buff_cb *skcb); +void j1939_sk_send_loop_abort(struct sock *sk, int err); +void j1939_sk_errqueue(struct j1939_session *session, + enum j1939_sk_errqueue_type type); +void j1939_sk_queue_activate_next(struct j1939_session *session); + +/* stack entries */ +struct j1939_session *j1939_tp_send(struct j1939_priv *priv, + struct sk_buff *skb, size_t size); +int j1939_tp_recv(struct j1939_priv *priv, struct sk_buff *skb); +int j1939_ac_fixup(struct j1939_priv *priv, struct sk_buff *skb); +void j1939_ac_recv(struct j1939_priv *priv, struct sk_buff *skb); +void j1939_simple_recv(struct j1939_priv *priv, struct sk_buff *skb); + +/* network management */ +struct j1939_ecu *j1939_ecu_create_locked(struct j1939_priv *priv, name_t name); + +void j1939_ecu_timer_start(struct j1939_ecu *ecu); +void j1939_ecu_timer_cancel(struct j1939_ecu *ecu); +void j1939_ecu_unmap_all(struct j1939_priv *priv); + +struct j1939_priv *j1939_netdev_start(struct net_device *ndev); +void j1939_netdev_stop(struct j1939_priv *priv); + +void j1939_priv_put(struct j1939_priv *priv); +void j1939_priv_get(struct j1939_priv *priv); + +/* notify/alert all j1939 sockets bound to ifindex */ +void j1939_sk_netdev_event_netdown(struct j1939_priv *priv); +int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk); +void j1939_tp_init(struct j1939_priv *priv); + +/* decrement pending skb for a j1939 socket */ +void j1939_sock_pending_del(struct sock *sk); + +enum j1939_session_state { + J1939_SESSION_NEW, + J1939_SESSION_ACTIVE, + /* waiting for abort signal on the bus */ + J1939_SESSION_WAITING_ABORT, + J1939_SESSION_ACTIVE_MAX, + J1939_SESSION_DONE, +}; + +struct j1939_session { + struct j1939_priv *priv; + struct list_head active_session_list_entry; + struct list_head sk_session_queue_entry; + struct kref kref; + struct sock *sk; + + /* ifindex, src, dst, pgn define the session block + * the are _never_ modified after insertion in the list + * this decreases locking problems a _lot_ + */ + struct j1939_sk_buff_cb skcb; + struct sk_buff_head skb_queue; + + /* all tx related stuff (last_txcmd, pkt.tx) + * is protected (modified only) with the txtimer hrtimer + * 'total' & 'block' are never changed, + * last_cmd, last & block are protected by ->lock + * this means that the tx may run after cts is received that should + * have stopped tx, but this time discrepancy is never avoided anyhow + */ + u8 last_cmd, last_txcmd; + bool transmission; + bool extd; + /* Total message size, number of bytes */ + unsigned int total_message_size; + /* Total number of bytes queue from socket to the session */ + unsigned int total_queued_size; + unsigned int tx_retry; + + int err; + u32 tskey; + enum j1939_session_state state; + + /* Packets counters for a (extended) transfer session. The packet is + * maximal of 7 bytes. + */ + struct { + /* total - total number of packets for this session */ + unsigned int total; + /* last - last packet of a transfer block after which + * responder should send ETP.CM_CTS and originator + * ETP.CM_DPO + */ + unsigned int last; + /* tx - number of packets send by originator node. + * this counter can be set back if responder node + * didn't received all packets send by originator. + */ + unsigned int tx; + unsigned int tx_acked; + /* rx - number of packets received */ + unsigned int rx; + /* block - amount of packets expected in one block */ + unsigned int block; + /* dpo - ETP.CM_DPO, Data Packet Offset */ + unsigned int dpo; + } pkt; + struct hrtimer txtimer, rxtimer; +}; + +struct j1939_sock { + struct sock sk; /* must be first to skip with memset */ + struct j1939_priv *priv; + struct list_head list; + +#define J1939_SOCK_BOUND BIT(0) +#define J1939_SOCK_CONNECTED BIT(1) +#define J1939_SOCK_PROMISC BIT(2) +#define J1939_SOCK_ERRQUEUE BIT(3) + int state; + + int ifindex; + struct j1939_addr addr; + struct j1939_filter *filters; + int nfilters; + pgn_t pgn_rx_filter; + + /* j1939 may emit equal PGN (!= equal CAN-id's) out of order + * when transport protocol comes in. + * To allow emitting in order, keep a 'pending' nr. of packets + */ + atomic_t skb_pending; + wait_queue_head_t waitq; + + /* lock for the sk_session_queue list */ + spinlock_t sk_session_queue_lock; + struct list_head sk_session_queue; +}; + +static inline struct j1939_sock *j1939_sk(const struct sock *sk) +{ + return container_of(sk, struct j1939_sock, sk); +} + +void j1939_session_get(struct j1939_session *session); +void j1939_session_put(struct j1939_session *session); +void j1939_session_skb_queue(struct j1939_session *session, + struct sk_buff *skb); +int j1939_session_activate(struct j1939_session *session); +void j1939_tp_schedule_txtimer(struct j1939_session *session, int msec); +void j1939_session_timers_cancel(struct j1939_session *session); + +#define J1939_MAX_TP_PACKET_SIZE (7 * 0xff) +#define J1939_MAX_ETP_PACKET_SIZE (7 * 0x00ffffff) + +#define J1939_REGULAR 0 +#define J1939_EXTENDED 1 + +/* CAN protocol */ +extern const struct can_proto j1939_can_proto; + +#endif /* _J1939_PRIV_H_ */ diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c new file mode 100644 index 000000000000..137054bff9ec --- /dev/null +++ b/net/can/j1939/main.c @@ -0,0 +1,412 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2010-2011 EIA Electronics, +// Pieter Beyens <pieter.beyens@eia.be> +// Copyright (c) 2010-2011 EIA Electronics, +// Kurt Van Dijck <kurt.van.dijck@eia.be> +// Copyright (c) 2018 Protonic, +// Robin van der Gracht <robin@protonic.nl> +// Copyright (c) 2017-2019 Pengutronix, +// Marc Kleine-Budde <kernel@pengutronix.de> +// Copyright (c) 2017-2019 Pengutronix, +// Oleksij Rempel <kernel@pengutronix.de> + +/* Core of can-j1939 that links j1939 to CAN. */ + +#include <linux/can/can-ml.h> +#include <linux/can/core.h> +#include <linux/can/skb.h> +#include <linux/if_arp.h> +#include <linux/module.h> + +#include "j1939-priv.h" + +MODULE_DESCRIPTION("PF_CAN SAE J1939"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("EIA Electronics (Kurt Van Dijck & Pieter Beyens)"); +MODULE_ALIAS("can-proto-" __stringify(CAN_J1939)); + +/* LOWLEVEL CAN interface */ + +/* CAN_HDR: #bytes before can_frame data part */ +#define J1939_CAN_HDR (offsetof(struct can_frame, data)) + +/* CAN_FTR: #bytes beyond data part */ +#define J1939_CAN_FTR (sizeof(struct can_frame) - J1939_CAN_HDR - \ + sizeof(((struct can_frame *)0)->data)) + +/* lowest layer */ +static void j1939_can_recv(struct sk_buff *iskb, void *data) +{ + struct j1939_priv *priv = data; + struct sk_buff *skb; + struct j1939_sk_buff_cb *skcb, *iskcb; + struct can_frame *cf; + + /* create a copy of the skb + * j1939 only delivers the real data bytes, + * the header goes into sockaddr. + * j1939 may not touch the incoming skb in such way + */ + skb = skb_clone(iskb, GFP_ATOMIC); + if (!skb) + return; + + j1939_priv_get(priv); + can_skb_set_owner(skb, iskb->sk); + + /* get a pointer to the header of the skb + * the skb payload (pointer) is moved, so that the next skb_data + * returns the actual payload + */ + cf = (void *)skb->data; + skb_pull(skb, J1939_CAN_HDR); + + /* fix length, set to dlc, with 8 maximum */ + skb_trim(skb, min_t(uint8_t, cf->can_dlc, 8)); + + /* set addr */ + skcb = j1939_skb_to_cb(skb); + memset(skcb, 0, sizeof(*skcb)); + + iskcb = j1939_skb_to_cb(iskb); + skcb->tskey = iskcb->tskey; + skcb->priority = (cf->can_id >> 26) & 0x7; + skcb->addr.sa = cf->can_id; + skcb->addr.pgn = (cf->can_id >> 8) & J1939_PGN_MAX; + /* set default message type */ + skcb->addr.type = J1939_TP; + if (j1939_pgn_is_pdu1(skcb->addr.pgn)) { + /* Type 1: with destination address */ + skcb->addr.da = skcb->addr.pgn; + /* normalize pgn: strip dst address */ + skcb->addr.pgn &= 0x3ff00; + } else { + /* set broadcast address */ + skcb->addr.da = J1939_NO_ADDR; + } + + /* update localflags */ + read_lock_bh(&priv->lock); + if (j1939_address_is_unicast(skcb->addr.sa) && + priv->ents[skcb->addr.sa].nusers) + skcb->flags |= J1939_ECU_LOCAL_SRC; + if (j1939_address_is_unicast(skcb->addr.da) && + priv->ents[skcb->addr.da].nusers) + skcb->flags |= J1939_ECU_LOCAL_DST; + read_unlock_bh(&priv->lock); + + /* deliver into the j1939 stack ... */ + j1939_ac_recv(priv, skb); + + if (j1939_tp_recv(priv, skb)) + /* this means the transport layer processed the message */ + goto done; + + j1939_simple_recv(priv, skb); + j1939_sk_recv(priv, skb); + done: + j1939_priv_put(priv); + kfree_skb(skb); +} + +/* NETDEV MANAGEMENT */ + +/* values for can_rx_(un)register */ +#define J1939_CAN_ID CAN_EFF_FLAG +#define J1939_CAN_MASK (CAN_EFF_FLAG | CAN_RTR_FLAG) + +static DEFINE_SPINLOCK(j1939_netdev_lock); + +static struct j1939_priv *j1939_priv_create(struct net_device *ndev) +{ + struct j1939_priv *priv; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return NULL; + + rwlock_init(&priv->lock); + INIT_LIST_HEAD(&priv->ecus); + priv->ndev = ndev; + kref_init(&priv->kref); + kref_init(&priv->rx_kref); + dev_hold(ndev); + + netdev_dbg(priv->ndev, "%s : 0x%p\n", __func__, priv); + + return priv; +} + +static inline void j1939_priv_set(struct net_device *ndev, + struct j1939_priv *priv) +{ + struct can_ml_priv *can_ml_priv = ndev->ml_priv; + + can_ml_priv->j1939_priv = priv; +} + +static void __j1939_priv_release(struct kref *kref) +{ + struct j1939_priv *priv = container_of(kref, struct j1939_priv, kref); + struct net_device *ndev = priv->ndev; + + netdev_dbg(priv->ndev, "%s: 0x%p\n", __func__, priv); + + WARN_ON_ONCE(!list_empty(&priv->active_session_list)); + WARN_ON_ONCE(!list_empty(&priv->ecus)); + WARN_ON_ONCE(!list_empty(&priv->j1939_socks)); + + dev_put(ndev); + kfree(priv); +} + +void j1939_priv_put(struct j1939_priv *priv) +{ + kref_put(&priv->kref, __j1939_priv_release); +} + +void j1939_priv_get(struct j1939_priv *priv) +{ + kref_get(&priv->kref); +} + +static int j1939_can_rx_register(struct j1939_priv *priv) +{ + struct net_device *ndev = priv->ndev; + int ret; + + j1939_priv_get(priv); + ret = can_rx_register(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK, + j1939_can_recv, priv, "j1939", NULL); + if (ret < 0) { + j1939_priv_put(priv); + return ret; + } + + return 0; +} + +static void j1939_can_rx_unregister(struct j1939_priv *priv) +{ + struct net_device *ndev = priv->ndev; + + can_rx_unregister(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK, + j1939_can_recv, priv); + + j1939_priv_put(priv); +} + +static void __j1939_rx_release(struct kref *kref) + __releases(&j1939_netdev_lock) +{ + struct j1939_priv *priv = container_of(kref, struct j1939_priv, + rx_kref); + + j1939_can_rx_unregister(priv); + j1939_ecu_unmap_all(priv); + j1939_priv_set(priv->ndev, NULL); + spin_unlock(&j1939_netdev_lock); +} + +/* get pointer to priv without increasing ref counter */ +static inline struct j1939_priv *j1939_ndev_to_priv(struct net_device *ndev) +{ + struct can_ml_priv *can_ml_priv = ndev->ml_priv; + + if (!can_ml_priv) + return NULL; + + return can_ml_priv->j1939_priv; +} + +static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev) +{ + struct j1939_priv *priv; + + lockdep_assert_held(&j1939_netdev_lock); + + if (ndev->type != ARPHRD_CAN) + return NULL; + + priv = j1939_ndev_to_priv(ndev); + if (priv) + j1939_priv_get(priv); + + return priv; +} + +static struct j1939_priv *j1939_priv_get_by_ndev(struct net_device *ndev) +{ + struct j1939_priv *priv; + + spin_lock(&j1939_netdev_lock); + priv = j1939_priv_get_by_ndev_locked(ndev); + spin_unlock(&j1939_netdev_lock); + + return priv; +} + +struct j1939_priv *j1939_netdev_start(struct net_device *ndev) +{ + struct j1939_priv *priv, *priv_new; + int ret; + + priv = j1939_priv_get_by_ndev(ndev); + if (priv) { + kref_get(&priv->rx_kref); + return priv; + } + + priv = j1939_priv_create(ndev); + if (!priv) + return ERR_PTR(-ENOMEM); + + j1939_tp_init(priv); + spin_lock_init(&priv->j1939_socks_lock); + INIT_LIST_HEAD(&priv->j1939_socks); + + spin_lock(&j1939_netdev_lock); + priv_new = j1939_priv_get_by_ndev_locked(ndev); + if (priv_new) { + /* Someone was faster than us, use their priv and roll + * back our's. + */ + spin_unlock(&j1939_netdev_lock); + dev_put(ndev); + kfree(priv); + kref_get(&priv_new->rx_kref); + return priv_new; + } + j1939_priv_set(ndev, priv); + spin_unlock(&j1939_netdev_lock); + + ret = j1939_can_rx_register(priv); + if (ret < 0) + goto out_priv_put; + + return priv; + + out_priv_put: + j1939_priv_set(ndev, NULL); + dev_put(ndev); + kfree(priv); + + return ERR_PTR(ret); +} + +void j1939_netdev_stop(struct j1939_priv *priv) +{ + kref_put_lock(&priv->rx_kref, __j1939_rx_release, &j1939_netdev_lock); + j1939_priv_put(priv); +} + +int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb) +{ + int ret, dlc; + canid_t canid; + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct can_frame *cf; + + /* apply sanity checks */ + if (j1939_pgn_is_pdu1(skcb->addr.pgn)) + skcb->addr.pgn &= J1939_PGN_PDU1_MAX; + else + skcb->addr.pgn &= J1939_PGN_MAX; + + if (skcb->priority > 7) + skcb->priority = 6; + + ret = j1939_ac_fixup(priv, skb); + if (unlikely(ret)) + goto failed; + dlc = skb->len; + + /* re-claim the CAN_HDR from the SKB */ + cf = skb_push(skb, J1939_CAN_HDR); + + /* make it a full can frame again */ + skb_put(skb, J1939_CAN_FTR + (8 - dlc)); + + canid = CAN_EFF_FLAG | + (skcb->priority << 26) | + (skcb->addr.pgn << 8) | + skcb->addr.sa; + if (j1939_pgn_is_pdu1(skcb->addr.pgn)) + canid |= skcb->addr.da << 8; + + cf->can_id = canid; + cf->can_dlc = dlc; + + return can_send(skb, 1); + + failed: + kfree_skb(skb); + return ret; +} + +static int j1939_netdev_notify(struct notifier_block *nb, + unsigned long msg, void *data) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(data); + struct j1939_priv *priv; + + priv = j1939_priv_get_by_ndev(ndev); + if (!priv) + goto notify_done; + + if (ndev->type != ARPHRD_CAN) + goto notify_put; + + switch (msg) { + case NETDEV_DOWN: + j1939_cancel_active_session(priv, NULL); + j1939_sk_netdev_event_netdown(priv); + j1939_ecu_unmap_all(priv); + break; + } + +notify_put: + j1939_priv_put(priv); + +notify_done: + return NOTIFY_DONE; +} + +static struct notifier_block j1939_netdev_notifier = { + .notifier_call = j1939_netdev_notify, +}; + +/* MODULE interface */ +static __init int j1939_module_init(void) +{ + int ret; + + pr_info("can: SAE J1939\n"); + + ret = register_netdevice_notifier(&j1939_netdev_notifier); + if (ret) + goto fail_notifier; + + ret = can_proto_register(&j1939_can_proto); + if (ret < 0) { + pr_err("can: registration of j1939 protocol failed\n"); + goto fail_sk; + } + + return 0; + + fail_sk: + unregister_netdevice_notifier(&j1939_netdev_notifier); + fail_notifier: + return ret; +} + +static __exit void j1939_module_exit(void) +{ + can_proto_unregister(&j1939_can_proto); + + unregister_netdevice_notifier(&j1939_netdev_notifier); +} + +module_init(j1939_module_init); +module_exit(j1939_module_exit); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c new file mode 100644 index 000000000000..f7587428febd --- /dev/null +++ b/net/can/j1939/socket.c @@ -0,0 +1,1223 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2010-2011 EIA Electronics, +// Pieter Beyens <pieter.beyens@eia.be> +// Copyright (c) 2010-2011 EIA Electronics, +// Kurt Van Dijck <kurt.van.dijck@eia.be> +// Copyright (c) 2018 Protonic, +// Robin van der Gracht <robin@protonic.nl> +// Copyright (c) 2017-2019 Pengutronix, +// Marc Kleine-Budde <kernel@pengutronix.de> +// Copyright (c) 2017-2019 Pengutronix, +// Oleksij Rempel <kernel@pengutronix.de> + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/can/core.h> +#include <linux/can/skb.h> +#include <linux/errqueue.h> +#include <linux/if_arp.h> + +#include "j1939-priv.h" + +#define J1939_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.j1939) + +/* conversion function between struct sock::sk_priority from linux and + * j1939 priority field + */ +static inline priority_t j1939_prio(u32 sk_priority) +{ + sk_priority = min(sk_priority, 7U); + + return 7 - sk_priority; +} + +static inline u32 j1939_to_sk_priority(priority_t prio) +{ + return 7 - prio; +} + +/* function to see if pgn is to be evaluated */ +static inline bool j1939_pgn_is_valid(pgn_t pgn) +{ + return pgn <= J1939_PGN_MAX; +} + +/* test function to avoid non-zero DA placeholder for pdu1 pgn's */ +static inline bool j1939_pgn_is_clean_pdu(pgn_t pgn) +{ + if (j1939_pgn_is_pdu1(pgn)) + return !(pgn & 0xff); + else + return true; +} + +static inline void j1939_sock_pending_add(struct sock *sk) +{ + struct j1939_sock *jsk = j1939_sk(sk); + + atomic_inc(&jsk->skb_pending); +} + +static int j1939_sock_pending_get(struct sock *sk) +{ + struct j1939_sock *jsk = j1939_sk(sk); + + return atomic_read(&jsk->skb_pending); +} + +void j1939_sock_pending_del(struct sock *sk) +{ + struct j1939_sock *jsk = j1939_sk(sk); + + /* atomic_dec_return returns the new value */ + if (!atomic_dec_return(&jsk->skb_pending)) + wake_up(&jsk->waitq); /* no pending SKB's */ +} + +static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk) +{ + jsk->state |= J1939_SOCK_BOUND; + j1939_priv_get(priv); + + spin_lock_bh(&priv->j1939_socks_lock); + list_add_tail(&jsk->list, &priv->j1939_socks); + spin_unlock_bh(&priv->j1939_socks_lock); +} + +static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk) +{ + spin_lock_bh(&priv->j1939_socks_lock); + list_del_init(&jsk->list); + spin_unlock_bh(&priv->j1939_socks_lock); + + j1939_priv_put(priv); + jsk->state &= ~J1939_SOCK_BOUND; +} + +static bool j1939_sk_queue_session(struct j1939_session *session) +{ + struct j1939_sock *jsk = j1939_sk(session->sk); + bool empty; + + spin_lock_bh(&jsk->sk_session_queue_lock); + empty = list_empty(&jsk->sk_session_queue); + j1939_session_get(session); + list_add_tail(&session->sk_session_queue_entry, &jsk->sk_session_queue); + spin_unlock_bh(&jsk->sk_session_queue_lock); + j1939_sock_pending_add(&jsk->sk); + + return empty; +} + +static struct +j1939_session *j1939_sk_get_incomplete_session(struct j1939_sock *jsk) +{ + struct j1939_session *session = NULL; + + spin_lock_bh(&jsk->sk_session_queue_lock); + if (!list_empty(&jsk->sk_session_queue)) { + session = list_last_entry(&jsk->sk_session_queue, + struct j1939_session, + sk_session_queue_entry); + if (session->total_queued_size == session->total_message_size) + session = NULL; + else + j1939_session_get(session); + } + spin_unlock_bh(&jsk->sk_session_queue_lock); + + return session; +} + +static void j1939_sk_queue_drop_all(struct j1939_priv *priv, + struct j1939_sock *jsk, int err) +{ + struct j1939_session *session, *tmp; + + netdev_dbg(priv->ndev, "%s: err: %i\n", __func__, err); + spin_lock_bh(&jsk->sk_session_queue_lock); + list_for_each_entry_safe(session, tmp, &jsk->sk_session_queue, + sk_session_queue_entry) { + list_del_init(&session->sk_session_queue_entry); + session->err = err; + j1939_session_put(session); + } + spin_unlock_bh(&jsk->sk_session_queue_lock); +} + +static void j1939_sk_queue_activate_next_locked(struct j1939_session *session) +{ + struct j1939_sock *jsk; + struct j1939_session *first; + int err; + + /* RX-Session don't have a socket (yet) */ + if (!session->sk) + return; + + jsk = j1939_sk(session->sk); + lockdep_assert_held(&jsk->sk_session_queue_lock); + + err = session->err; + + first = list_first_entry_or_null(&jsk->sk_session_queue, + struct j1939_session, + sk_session_queue_entry); + + /* Some else has already activated the next session */ + if (first != session) + return; + +activate_next: + list_del_init(&first->sk_session_queue_entry); + j1939_session_put(first); + first = list_first_entry_or_null(&jsk->sk_session_queue, + struct j1939_session, + sk_session_queue_entry); + if (!first) + return; + + if (WARN_ON_ONCE(j1939_session_activate(first))) { + first->err = -EBUSY; + goto activate_next; + } else { + /* Give receiver some time (arbitrary chosen) to recover */ + int time_ms = 0; + + if (err) + time_ms = 10 + prandom_u32_max(16); + + j1939_tp_schedule_txtimer(first, time_ms); + } +} + +void j1939_sk_queue_activate_next(struct j1939_session *session) +{ + struct j1939_sock *jsk; + + if (!session->sk) + return; + + jsk = j1939_sk(session->sk); + + spin_lock_bh(&jsk->sk_session_queue_lock); + j1939_sk_queue_activate_next_locked(session); + spin_unlock_bh(&jsk->sk_session_queue_lock); +} + +static bool j1939_sk_match_dst(struct j1939_sock *jsk, + const struct j1939_sk_buff_cb *skcb) +{ + if ((jsk->state & J1939_SOCK_PROMISC)) + return true; + + /* Destination address filter */ + if (jsk->addr.src_name && skcb->addr.dst_name) { + if (jsk->addr.src_name != skcb->addr.dst_name) + return false; + } else { + /* receive (all sockets) if + * - all packages that match our bind() address + * - all broadcast on a socket if SO_BROADCAST + * is set + */ + if (j1939_address_is_unicast(skcb->addr.da)) { + if (jsk->addr.sa != skcb->addr.da) + return false; + } else if (!sock_flag(&jsk->sk, SOCK_BROADCAST)) { + /* receiving broadcast without SO_BROADCAST + * flag is not allowed + */ + return false; + } + } + + /* Source address filter */ + if (jsk->state & J1939_SOCK_CONNECTED) { + /* receive (all sockets) if + * - all packages that match our connect() name or address + */ + if (jsk->addr.dst_name && skcb->addr.src_name) { + if (jsk->addr.dst_name != skcb->addr.src_name) + return false; + } else { + if (jsk->addr.da != skcb->addr.sa) + return false; + } + } + + /* PGN filter */ + if (j1939_pgn_is_valid(jsk->pgn_rx_filter) && + jsk->pgn_rx_filter != skcb->addr.pgn) + return false; + + return true; +} + +/* matches skb control buffer (addr) with a j1939 filter */ +static bool j1939_sk_match_filter(struct j1939_sock *jsk, + const struct j1939_sk_buff_cb *skcb) +{ + const struct j1939_filter *f = jsk->filters; + int nfilter = jsk->nfilters; + + if (!nfilter) + /* receive all when no filters are assigned */ + return true; + + for (; nfilter; ++f, --nfilter) { + if ((skcb->addr.pgn & f->pgn_mask) != f->pgn) + continue; + if ((skcb->addr.sa & f->addr_mask) != f->addr) + continue; + if ((skcb->addr.src_name & f->name_mask) != f->name) + continue; + return true; + } + return false; +} + +static bool j1939_sk_recv_match_one(struct j1939_sock *jsk, + const struct j1939_sk_buff_cb *skcb) +{ + if (!(jsk->state & J1939_SOCK_BOUND)) + return false; + + if (!j1939_sk_match_dst(jsk, skcb)) + return false; + + if (!j1939_sk_match_filter(jsk, skcb)) + return false; + + return true; +} + +static void j1939_sk_recv_one(struct j1939_sock *jsk, struct sk_buff *oskb) +{ + const struct j1939_sk_buff_cb *oskcb = j1939_skb_to_cb(oskb); + struct j1939_sk_buff_cb *skcb; + struct sk_buff *skb; + + if (oskb->sk == &jsk->sk) + return; + + if (!j1939_sk_recv_match_one(jsk, oskcb)) + return; + + skb = skb_clone(oskb, GFP_ATOMIC); + if (!skb) { + pr_warn("skb clone failed\n"); + return; + } + can_skb_set_owner(skb, oskb->sk); + + skcb = j1939_skb_to_cb(skb); + skcb->msg_flags &= ~(MSG_DONTROUTE); + if (skb->sk) + skcb->msg_flags |= MSG_DONTROUTE; + + if (sock_queue_rcv_skb(&jsk->sk, skb) < 0) + kfree_skb(skb); +} + +bool j1939_sk_recv_match(struct j1939_priv *priv, struct j1939_sk_buff_cb *skcb) +{ + struct j1939_sock *jsk; + bool match = false; + + spin_lock_bh(&priv->j1939_socks_lock); + list_for_each_entry(jsk, &priv->j1939_socks, list) { + match = j1939_sk_recv_match_one(jsk, skcb); + if (match) + break; + } + spin_unlock_bh(&priv->j1939_socks_lock); + + return match; +} + +void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb) +{ + struct j1939_sock *jsk; + + spin_lock_bh(&priv->j1939_socks_lock); + list_for_each_entry(jsk, &priv->j1939_socks, list) { + j1939_sk_recv_one(jsk, skb); + } + spin_unlock_bh(&priv->j1939_socks_lock); +} + +static void j1939_sk_sock_destruct(struct sock *sk) +{ + struct j1939_sock *jsk = j1939_sk(sk); + + /* This function will be call by the generic networking code, when then + * the socket is ultimately closed (sk->sk_destruct). + * + * The race between + * - processing a received CAN frame + * (can_receive -> j1939_can_recv) + * and accessing j1939_priv + * ... and ... + * - closing a socket + * (j1939_can_rx_unregister -> can_rx_unregister) + * and calling the final j1939_priv_put() + * + * is avoided by calling the final j1939_priv_put() from this + * RCU deferred cleanup call. + */ + if (jsk->priv) { + j1939_priv_put(jsk->priv); + jsk->priv = NULL; + } + + /* call generic CAN sock destruct */ + can_sock_destruct(sk); +} + +static int j1939_sk_init(struct sock *sk) +{ + struct j1939_sock *jsk = j1939_sk(sk); + + /* Ensure that "sk" is first member in "struct j1939_sock", so that we + * can skip it during memset(). + */ + BUILD_BUG_ON(offsetof(struct j1939_sock, sk) != 0); + memset((void *)jsk + sizeof(jsk->sk), 0x0, + sizeof(*jsk) - sizeof(jsk->sk)); + + INIT_LIST_HEAD(&jsk->list); + init_waitqueue_head(&jsk->waitq); + jsk->sk.sk_priority = j1939_to_sk_priority(6); + jsk->sk.sk_reuse = 1; /* per default */ + jsk->addr.sa = J1939_NO_ADDR; + jsk->addr.da = J1939_NO_ADDR; + jsk->addr.pgn = J1939_NO_PGN; + jsk->pgn_rx_filter = J1939_NO_PGN; + atomic_set(&jsk->skb_pending, 0); + spin_lock_init(&jsk->sk_session_queue_lock); + INIT_LIST_HEAD(&jsk->sk_session_queue); + sk->sk_destruct = j1939_sk_sock_destruct; + + return 0; +} + +static int j1939_sk_sanity_check(struct sockaddr_can *addr, int len) +{ + if (!addr) + return -EDESTADDRREQ; + if (len < J1939_MIN_NAMELEN) + return -EINVAL; + if (addr->can_family != AF_CAN) + return -EINVAL; + if (!addr->can_ifindex) + return -ENODEV; + if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && + !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) + return -EINVAL; + + return 0; +} + +static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct j1939_sock *jsk = j1939_sk(sock->sk); + struct j1939_priv *priv; + struct sock *sk; + struct net *net; + int ret = 0; + + ret = j1939_sk_sanity_check(addr, len); + if (ret) + return ret; + + lock_sock(sock->sk); + + priv = jsk->priv; + sk = sock->sk; + net = sock_net(sk); + + /* Already bound to an interface? */ + if (jsk->state & J1939_SOCK_BOUND) { + /* A re-bind() to a different interface is not + * supported. + */ + if (jsk->ifindex != addr->can_ifindex) { + ret = -EINVAL; + goto out_release_sock; + } + + /* drop old references */ + j1939_jsk_del(priv, jsk); + j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); + } else { + struct net_device *ndev; + + ndev = dev_get_by_index(net, addr->can_ifindex); + if (!ndev) { + ret = -ENODEV; + goto out_release_sock; + } + + if (ndev->type != ARPHRD_CAN) { + dev_put(ndev); + ret = -ENODEV; + goto out_release_sock; + } + + priv = j1939_netdev_start(ndev); + dev_put(ndev); + if (IS_ERR(priv)) { + ret = PTR_ERR(priv); + goto out_release_sock; + } + + jsk->ifindex = addr->can_ifindex; + + /* the corresponding j1939_priv_put() is called via + * sk->sk_destruct, which points to j1939_sk_sock_destruct() + */ + j1939_priv_get(priv); + jsk->priv = priv; + } + + /* set default transmit pgn */ + if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) + jsk->pgn_rx_filter = addr->can_addr.j1939.pgn; + jsk->addr.src_name = addr->can_addr.j1939.name; + jsk->addr.sa = addr->can_addr.j1939.addr; + + /* get new references */ + ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa); + if (ret) { + j1939_netdev_stop(priv); + goto out_release_sock; + } + + j1939_jsk_add(priv, jsk); + + out_release_sock: /* fall through */ + release_sock(sock->sk); + + return ret; +} + +static int j1939_sk_connect(struct socket *sock, struct sockaddr *uaddr, + int len, int flags) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct j1939_sock *jsk = j1939_sk(sock->sk); + int ret = 0; + + ret = j1939_sk_sanity_check(addr, len); + if (ret) + return ret; + + lock_sock(sock->sk); + + /* bind() before connect() is mandatory */ + if (!(jsk->state & J1939_SOCK_BOUND)) { + ret = -EINVAL; + goto out_release_sock; + } + + /* A connect() to a different interface is not supported. */ + if (jsk->ifindex != addr->can_ifindex) { + ret = -EINVAL; + goto out_release_sock; + } + + if (!addr->can_addr.j1939.name && + addr->can_addr.j1939.addr == J1939_NO_ADDR && + !sock_flag(&jsk->sk, SOCK_BROADCAST)) { + /* broadcast, but SO_BROADCAST not set */ + ret = -EACCES; + goto out_release_sock; + } + + jsk->addr.dst_name = addr->can_addr.j1939.name; + jsk->addr.da = addr->can_addr.j1939.addr; + + if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) + jsk->addr.pgn = addr->can_addr.j1939.pgn; + + jsk->state |= J1939_SOCK_CONNECTED; + + out_release_sock: /* fall through */ + release_sock(sock->sk); + + return ret; +} + +static void j1939_sk_sock2sockaddr_can(struct sockaddr_can *addr, + const struct j1939_sock *jsk, int peer) +{ + addr->can_family = AF_CAN; + addr->can_ifindex = jsk->ifindex; + addr->can_addr.j1939.pgn = jsk->addr.pgn; + if (peer) { + addr->can_addr.j1939.name = jsk->addr.dst_name; + addr->can_addr.j1939.addr = jsk->addr.da; + } else { + addr->can_addr.j1939.name = jsk->addr.src_name; + addr->can_addr.j1939.addr = jsk->addr.sa; + } +} + +static int j1939_sk_getname(struct socket *sock, struct sockaddr *uaddr, + int peer) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct sock *sk = sock->sk; + struct j1939_sock *jsk = j1939_sk(sk); + int ret = 0; + + lock_sock(sk); + + if (peer && !(jsk->state & J1939_SOCK_CONNECTED)) { + ret = -EADDRNOTAVAIL; + goto failure; + } + + j1939_sk_sock2sockaddr_can(addr, jsk, peer); + ret = J1939_MIN_NAMELEN; + + failure: + release_sock(sk); + + return ret; +} + +static int j1939_sk_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct j1939_sock *jsk; + + if (!sk) + return 0; + + lock_sock(sk); + jsk = j1939_sk(sk); + + if (jsk->state & J1939_SOCK_BOUND) { + struct j1939_priv *priv = jsk->priv; + + if (wait_event_interruptible(jsk->waitq, + !j1939_sock_pending_get(&jsk->sk))) { + j1939_cancel_active_session(priv, sk); + j1939_sk_queue_drop_all(priv, jsk, ESHUTDOWN); + } + + j1939_jsk_del(priv, jsk); + + j1939_local_ecu_put(priv, jsk->addr.src_name, + jsk->addr.sa); + + j1939_netdev_stop(priv); + } + + kfree(jsk->filters); + sock_orphan(sk); + sock->sk = NULL; + + release_sock(sk); + sock_put(sk); + + return 0; +} + +static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, char __user *optval, + unsigned int optlen, int flag) +{ + int tmp; + + if (optlen != sizeof(tmp)) + return -EINVAL; + if (copy_from_user(&tmp, optval, optlen)) + return -EFAULT; + lock_sock(&jsk->sk); + if (tmp) + jsk->state |= flag; + else + jsk->state &= ~flag; + release_sock(&jsk->sk); + return tmp; +} + +static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct j1939_sock *jsk = j1939_sk(sk); + int tmp, count = 0, ret = 0; + struct j1939_filter *filters = NULL, *ofilters; + + if (level != SOL_CAN_J1939) + return -EINVAL; + + switch (optname) { + case SO_J1939_FILTER: + if (optval) { + struct j1939_filter *f; + int c; + + if (optlen % sizeof(*filters) != 0) + return -EINVAL; + + if (optlen > J1939_FILTER_MAX * + sizeof(struct j1939_filter)) + return -EINVAL; + + count = optlen / sizeof(*filters); + filters = memdup_user(optval, optlen); + if (IS_ERR(filters)) + return PTR_ERR(filters); + + for (f = filters, c = count; c; f++, c--) { + f->name &= f->name_mask; + f->pgn &= f->pgn_mask; + f->addr &= f->addr_mask; + } + } + + lock_sock(&jsk->sk); + ofilters = jsk->filters; + jsk->filters = filters; + jsk->nfilters = count; + release_sock(&jsk->sk); + kfree(ofilters); + return 0; + case SO_J1939_PROMISC: + return j1939_sk_setsockopt_flag(jsk, optval, optlen, + J1939_SOCK_PROMISC); + case SO_J1939_ERRQUEUE: + ret = j1939_sk_setsockopt_flag(jsk, optval, optlen, + J1939_SOCK_ERRQUEUE); + if (ret < 0) + return ret; + + if (!(jsk->state & J1939_SOCK_ERRQUEUE)) + skb_queue_purge(&sk->sk_error_queue); + return ret; + case SO_J1939_SEND_PRIO: + if (optlen != sizeof(tmp)) + return -EINVAL; + if (copy_from_user(&tmp, optval, optlen)) + return -EFAULT; + if (tmp < 0 || tmp > 7) + return -EDOM; + if (tmp < 2 && !capable(CAP_NET_ADMIN)) + return -EPERM; + lock_sock(&jsk->sk); + jsk->sk.sk_priority = j1939_to_sk_priority(tmp); + release_sock(&jsk->sk); + return 0; + default: + return -ENOPROTOOPT; + } +} + +static int j1939_sk_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct j1939_sock *jsk = j1939_sk(sk); + int ret, ulen; + /* set defaults for using 'int' properties */ + int tmp = 0; + int len = sizeof(tmp); + void *val = &tmp; + + if (level != SOL_CAN_J1939) + return -EINVAL; + if (get_user(ulen, optlen)) + return -EFAULT; + if (ulen < 0) + return -EINVAL; + + lock_sock(&jsk->sk); + switch (optname) { + case SO_J1939_PROMISC: + tmp = (jsk->state & J1939_SOCK_PROMISC) ? 1 : 0; + break; + case SO_J1939_ERRQUEUE: + tmp = (jsk->state & J1939_SOCK_ERRQUEUE) ? 1 : 0; + break; + case SO_J1939_SEND_PRIO: + tmp = j1939_prio(jsk->sk.sk_priority); + break; + default: + ret = -ENOPROTOOPT; + goto no_copy; + } + + /* copy to user, based on 'len' & 'val' + * but most sockopt's are 'int' properties, and have 'len' & 'val' + * left unchanged, but instead modified 'tmp' + */ + if (len > ulen) + ret = -EFAULT; + else if (put_user(len, optlen)) + ret = -EFAULT; + else if (copy_to_user(optval, val, len)) + ret = -EFAULT; + else + ret = 0; + no_copy: + release_sock(&jsk->sk); + return ret; +} + +static int j1939_sk_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + struct j1939_sk_buff_cb *skcb; + int ret = 0; + + if (flags & ~(MSG_DONTWAIT | MSG_ERRQUEUE)) + return -EINVAL; + + if (flags & MSG_ERRQUEUE) + return sock_recv_errqueue(sock->sk, msg, size, SOL_CAN_J1939, + SCM_J1939_ERRQUEUE); + + skb = skb_recv_datagram(sk, flags, 0, &ret); + if (!skb) + return ret; + + if (size < skb->len) + msg->msg_flags |= MSG_TRUNC; + else + size = skb->len; + + ret = memcpy_to_msg(msg, skb->data, size); + if (ret < 0) { + skb_free_datagram(sk, skb); + return ret; + } + + skcb = j1939_skb_to_cb(skb); + if (j1939_address_is_valid(skcb->addr.da)) + put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_ADDR, + sizeof(skcb->addr.da), &skcb->addr.da); + + if (skcb->addr.dst_name) + put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_NAME, + sizeof(skcb->addr.dst_name), &skcb->addr.dst_name); + + put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_PRIO, + sizeof(skcb->priority), &skcb->priority); + + if (msg->msg_name) { + struct sockaddr_can *paddr = msg->msg_name; + + msg->msg_namelen = J1939_MIN_NAMELEN; + memset(msg->msg_name, 0, msg->msg_namelen); + paddr->can_family = AF_CAN; + paddr->can_ifindex = skb->skb_iif; + paddr->can_addr.j1939.name = skcb->addr.src_name; + paddr->can_addr.j1939.addr = skcb->addr.sa; + paddr->can_addr.j1939.pgn = skcb->addr.pgn; + } + + sock_recv_ts_and_drops(msg, sk, skb); + msg->msg_flags |= skcb->msg_flags; + skb_free_datagram(sk, skb); + + return size; +} + +static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev, + struct sock *sk, + struct msghdr *msg, size_t size, + int *errcode) +{ + struct j1939_sock *jsk = j1939_sk(sk); + struct j1939_sk_buff_cb *skcb; + struct sk_buff *skb; + int ret; + + skb = sock_alloc_send_skb(sk, + size + + sizeof(struct can_frame) - + sizeof(((struct can_frame *)NULL)->data) + + sizeof(struct can_skb_priv), + msg->msg_flags & MSG_DONTWAIT, &ret); + if (!skb) + goto failure; + + can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = ndev->ifindex; + can_skb_prv(skb)->skbcnt = 0; + skb_reserve(skb, offsetof(struct can_frame, data)); + + ret = memcpy_from_msg(skb_put(skb, size), msg, size); + if (ret < 0) + goto free_skb; + + skb->dev = ndev; + + skcb = j1939_skb_to_cb(skb); + memset(skcb, 0, sizeof(*skcb)); + skcb->addr = jsk->addr; + skcb->priority = j1939_prio(sk->sk_priority); + + if (msg->msg_name) { + struct sockaddr_can *addr = msg->msg_name; + + if (addr->can_addr.j1939.name || + addr->can_addr.j1939.addr != J1939_NO_ADDR) { + skcb->addr.dst_name = addr->can_addr.j1939.name; + skcb->addr.da = addr->can_addr.j1939.addr; + } + if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) + skcb->addr.pgn = addr->can_addr.j1939.pgn; + } + + *errcode = ret; + return skb; + +free_skb: + kfree_skb(skb); +failure: + *errcode = ret; + return NULL; +} + +static size_t j1939_sk_opt_stats_get_size(void) +{ + return + nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */ + 0; +} + +static struct sk_buff * +j1939_sk_get_timestamping_opt_stats(struct j1939_session *session) +{ + struct sk_buff *stats; + u32 size; + + stats = alloc_skb(j1939_sk_opt_stats_get_size(), GFP_ATOMIC); + if (!stats) + return NULL; + + if (session->skcb.addr.type == J1939_SIMPLE) + size = session->total_message_size; + else + size = min(session->pkt.tx_acked * 7, + session->total_message_size); + + nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size); + + return stats; +} + +void j1939_sk_errqueue(struct j1939_session *session, + enum j1939_sk_errqueue_type type) +{ + struct j1939_priv *priv = session->priv; + struct sock *sk = session->sk; + struct j1939_sock *jsk; + struct sock_exterr_skb *serr; + struct sk_buff *skb; + char *state = "UNK"; + int err; + + /* currently we have no sk for the RX session */ + if (!sk) + return; + + jsk = j1939_sk(sk); + + if (!(jsk->state & J1939_SOCK_ERRQUEUE)) + return; + + skb = j1939_sk_get_timestamping_opt_stats(session); + if (!skb) + return; + + skb->tstamp = ktime_get_real(); + + BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + switch (type) { + case J1939_ERRQUEUE_ACK: + if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)) { + kfree_skb(skb); + return; + } + + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + serr->ee.ee_info = SCM_TSTAMP_ACK; + state = "ACK"; + break; + case J1939_ERRQUEUE_SCHED: + if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)) { + kfree_skb(skb); + return; + } + + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + serr->ee.ee_info = SCM_TSTAMP_SCHED; + state = "SCH"; + break; + case J1939_ERRQUEUE_ABORT: + serr->ee.ee_errno = session->err; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_info = J1939_EE_INFO_TX_ABORT; + state = "ABT"; + break; + default: + netdev_err(priv->ndev, "Unknown errqueue type %i\n", type); + } + + serr->opt_stats = true; + if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + serr->ee.ee_data = session->tskey; + + netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n", + __func__, session, session->tskey, state); + err = sock_queue_err_skb(sk, skb); + + if (err) + kfree_skb(skb); +}; + +void j1939_sk_send_loop_abort(struct sock *sk, int err) +{ + sk->sk_err = err; + + sk->sk_error_report(sk); +} + +static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, + struct msghdr *msg, size_t size) + +{ + struct j1939_sock *jsk = j1939_sk(sk); + struct j1939_session *session = j1939_sk_get_incomplete_session(jsk); + struct sk_buff *skb; + size_t segment_size, todo_size; + int ret = 0; + + if (session && + session->total_message_size != session->total_queued_size + size) { + j1939_session_put(session); + return -EIO; + } + + todo_size = size; + + while (todo_size) { + struct j1939_sk_buff_cb *skcb; + + segment_size = min_t(size_t, J1939_MAX_TP_PACKET_SIZE, + todo_size); + + /* Allocate skb for one segment */ + skb = j1939_sk_alloc_skb(priv->ndev, sk, msg, segment_size, + &ret); + if (ret) + break; + + skcb = j1939_skb_to_cb(skb); + + if (!session) { + /* at this point the size should be full size + * of the session + */ + skcb->offset = 0; + session = j1939_tp_send(priv, skb, size); + if (IS_ERR(session)) { + ret = PTR_ERR(session); + goto kfree_skb; + } + if (j1939_sk_queue_session(session)) { + /* try to activate session if we a + * fist in the queue + */ + if (!j1939_session_activate(session)) { + j1939_tp_schedule_txtimer(session, 0); + } else { + ret = -EBUSY; + session->err = ret; + j1939_sk_queue_drop_all(priv, jsk, + EBUSY); + break; + } + } + } else { + skcb->offset = session->total_queued_size; + j1939_session_skb_queue(session, skb); + } + + todo_size -= segment_size; + session->total_queued_size += segment_size; + } + + switch (ret) { + case 0: /* OK */ + if (todo_size) + netdev_warn(priv->ndev, + "no error found and not completely queued?! %zu\n", + todo_size); + ret = size; + break; + case -ERESTARTSYS: + ret = -EINTR; + /* fall through */ + case -EAGAIN: /* OK */ + if (todo_size != size) + ret = size - todo_size; + break; + default: /* ERROR */ + break; + } + + if (session) + j1939_session_put(session); + + return ret; + + kfree_skb: + kfree_skb(skb); + return ret; +} + +static int j1939_sk_sendmsg(struct socket *sock, struct msghdr *msg, + size_t size) +{ + struct sock *sk = sock->sk; + struct j1939_sock *jsk = j1939_sk(sk); + struct j1939_priv *priv; + int ifindex; + int ret; + + lock_sock(sock->sk); + /* various socket state tests */ + if (!(jsk->state & J1939_SOCK_BOUND)) { + ret = -EBADFD; + goto sendmsg_done; + } + + priv = jsk->priv; + ifindex = jsk->ifindex; + + if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR) { + /* no source address assigned yet */ + ret = -EBADFD; + goto sendmsg_done; + } + + /* deal with provided destination address info */ + if (msg->msg_name) { + struct sockaddr_can *addr = msg->msg_name; + + if (msg->msg_namelen < J1939_MIN_NAMELEN) { + ret = -EINVAL; + goto sendmsg_done; + } + + if (addr->can_family != AF_CAN) { + ret = -EINVAL; + goto sendmsg_done; + } + + if (addr->can_ifindex && addr->can_ifindex != ifindex) { + ret = -EBADFD; + goto sendmsg_done; + } + + if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && + !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) { + ret = -EINVAL; + goto sendmsg_done; + } + + if (!addr->can_addr.j1939.name && + addr->can_addr.j1939.addr == J1939_NO_ADDR && + !sock_flag(sk, SOCK_BROADCAST)) { + /* broadcast, but SO_BROADCAST not set */ + ret = -EACCES; + goto sendmsg_done; + } + } else { + if (!jsk->addr.dst_name && jsk->addr.da == J1939_NO_ADDR && + !sock_flag(sk, SOCK_BROADCAST)) { + /* broadcast, but SO_BROADCAST not set */ + ret = -EACCES; + goto sendmsg_done; + } + } + + ret = j1939_sk_send_loop(priv, sk, msg, size); + +sendmsg_done: + release_sock(sock->sk); + + return ret; +} + +void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) +{ + struct j1939_sock *jsk; + int error_code = ENETDOWN; + + spin_lock_bh(&priv->j1939_socks_lock); + list_for_each_entry(jsk, &priv->j1939_socks, list) { + jsk->sk.sk_err = error_code; + if (!sock_flag(&jsk->sk, SOCK_DEAD)) + jsk->sk.sk_error_report(&jsk->sk); + + j1939_sk_queue_drop_all(priv, jsk, error_code); + } + spin_unlock_bh(&priv->j1939_socks_lock); +} + +static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + /* no ioctls for socket layer -> hand it down to NIC layer */ + return -ENOIOCTLCMD; +} + +static const struct proto_ops j1939_ops = { + .family = PF_CAN, + .release = j1939_sk_release, + .bind = j1939_sk_bind, + .connect = j1939_sk_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = j1939_sk_getname, + .poll = datagram_poll, + .ioctl = j1939_sk_no_ioctlcmd, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = j1939_sk_setsockopt, + .getsockopt = j1939_sk_getsockopt, + .sendmsg = j1939_sk_sendmsg, + .recvmsg = j1939_sk_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto j1939_proto __read_mostly = { + .name = "CAN_J1939", + .owner = THIS_MODULE, + .obj_size = sizeof(struct j1939_sock), + .init = j1939_sk_init, +}; + +const struct can_proto j1939_can_proto = { + .type = SOCK_DGRAM, + .protocol = CAN_J1939, + .ops = &j1939_ops, + .prot = &j1939_proto, +}; diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c new file mode 100644 index 000000000000..9f99af5b0b11 --- /dev/null +++ b/net/can/j1939/transport.c @@ -0,0 +1,2063 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2010-2011 EIA Electronics, +// Kurt Van Dijck <kurt.van.dijck@eia.be> +// Copyright (c) 2018 Protonic, +// Robin van der Gracht <robin@protonic.nl> +// Copyright (c) 2017-2019 Pengutronix, +// Marc Kleine-Budde <kernel@pengutronix.de> +// Copyright (c) 2017-2019 Pengutronix, +// Oleksij Rempel <kernel@pengutronix.de> + +#include <linux/can/skb.h> + +#include "j1939-priv.h" + +#define J1939_XTP_TX_RETRY_LIMIT 100 + +#define J1939_ETP_PGN_CTL 0xc800 +#define J1939_ETP_PGN_DAT 0xc700 +#define J1939_TP_PGN_CTL 0xec00 +#define J1939_TP_PGN_DAT 0xeb00 + +#define J1939_TP_CMD_RTS 0x10 +#define J1939_TP_CMD_CTS 0x11 +#define J1939_TP_CMD_EOMA 0x13 +#define J1939_TP_CMD_BAM 0x20 +#define J1939_TP_CMD_ABORT 0xff + +#define J1939_ETP_CMD_RTS 0x14 +#define J1939_ETP_CMD_CTS 0x15 +#define J1939_ETP_CMD_DPO 0x16 +#define J1939_ETP_CMD_EOMA 0x17 +#define J1939_ETP_CMD_ABORT 0xff + +enum j1939_xtp_abort { + J1939_XTP_NO_ABORT = 0, + J1939_XTP_ABORT_BUSY = 1, + /* Already in one or more connection managed sessions and + * cannot support another. + * + * EALREADY: + * Operation already in progress + */ + + J1939_XTP_ABORT_RESOURCE = 2, + /* System resources were needed for another task so this + * connection managed session was terminated. + * + * EMSGSIZE: + * The socket type requires that message be sent atomically, + * and the size of the message to be sent made this + * impossible. + */ + + J1939_XTP_ABORT_TIMEOUT = 3, + /* A timeout occurred and this is the connection abort to + * close the session. + * + * EHOSTUNREACH: + * The destination host cannot be reached (probably because + * the host is down or a remote router cannot reach it). + */ + + J1939_XTP_ABORT_GENERIC = 4, + /* CTS messages received when data transfer is in progress + * + * EBADMSG: + * Not a data message + */ + + J1939_XTP_ABORT_FAULT = 5, + /* Maximal retransmit request limit reached + * + * ENOTRECOVERABLE: + * State not recoverable + */ + + J1939_XTP_ABORT_UNEXPECTED_DATA = 6, + /* Unexpected data transfer packet + * + * ENOTCONN: + * Transport endpoint is not connected + */ + + J1939_XTP_ABORT_BAD_SEQ = 7, + /* Bad sequence number (and software is not able to recover) + * + * EILSEQ: + * Illegal byte sequence + */ + + J1939_XTP_ABORT_DUP_SEQ = 8, + /* Duplicate sequence number (and software is not able to + * recover) + */ + + J1939_XTP_ABORT_EDPO_UNEXPECTED = 9, + /* Unexpected EDPO packet (ETP) or Message size > 1785 bytes + * (TP) + */ + + J1939_XTP_ABORT_BAD_EDPO_PGN = 10, + /* Unexpected EDPO PGN (PGN in EDPO is bad) */ + + J1939_XTP_ABORT_EDPO_OUTOF_CTS = 11, + /* EDPO number of packets is greater than CTS */ + + J1939_XTP_ABORT_BAD_EDPO_OFFSET = 12, + /* Bad EDPO offset */ + + J1939_XTP_ABORT_OTHER_DEPRECATED = 13, + /* Deprecated. Use 250 instead (Any other reason) */ + + J1939_XTP_ABORT_ECTS_UNXPECTED_PGN = 14, + /* Unexpected ECTS PGN (PGN in ECTS is bad) */ + + J1939_XTP_ABORT_ECTS_TOO_BIG = 15, + /* ECTS requested packets exceeds message size */ + + J1939_XTP_ABORT_OTHER = 250, + /* Any other reason (if a Connection Abort reason is + * identified that is not listed in the table use code 250) + */ +}; + +static unsigned int j1939_tp_block = 255; +static unsigned int j1939_tp_packet_delay; +static unsigned int j1939_tp_padding = 1; + +/* helpers */ +static const char *j1939_xtp_abort_to_str(enum j1939_xtp_abort abort) +{ + switch (abort) { + case J1939_XTP_ABORT_BUSY: + return "Already in one or more connection managed sessions and cannot support another."; + case J1939_XTP_ABORT_RESOURCE: + return "System resources were needed for another task so this connection managed session was terminated."; + case J1939_XTP_ABORT_TIMEOUT: + return "A timeout occurred and this is the connection abort to close the session."; + case J1939_XTP_ABORT_GENERIC: + return "CTS messages received when data transfer is in progress"; + case J1939_XTP_ABORT_FAULT: + return "Maximal retransmit request limit reached"; + case J1939_XTP_ABORT_UNEXPECTED_DATA: + return "Unexpected data transfer packet"; + case J1939_XTP_ABORT_BAD_SEQ: + return "Bad sequence number (and software is not able to recover)"; + case J1939_XTP_ABORT_DUP_SEQ: + return "Duplicate sequence number (and software is not able to recover)"; + case J1939_XTP_ABORT_EDPO_UNEXPECTED: + return "Unexpected EDPO packet (ETP) or Message size > 1785 bytes (TP)"; + case J1939_XTP_ABORT_BAD_EDPO_PGN: + return "Unexpected EDPO PGN (PGN in EDPO is bad)"; + case J1939_XTP_ABORT_EDPO_OUTOF_CTS: + return "EDPO number of packets is greater than CTS"; + case J1939_XTP_ABORT_BAD_EDPO_OFFSET: + return "Bad EDPO offset"; + case J1939_XTP_ABORT_OTHER_DEPRECATED: + return "Deprecated. Use 250 instead (Any other reason)"; + case J1939_XTP_ABORT_ECTS_UNXPECTED_PGN: + return "Unexpected ECTS PGN (PGN in ECTS is bad)"; + case J1939_XTP_ABORT_ECTS_TOO_BIG: + return "ECTS requested packets exceeds message size"; + case J1939_XTP_ABORT_OTHER: + return "Any other reason (if a Connection Abort reason is identified that is not listed in the table use code 250)"; + default: + return "<unknown>"; + } +} + +static int j1939_xtp_abort_to_errno(struct j1939_priv *priv, + enum j1939_xtp_abort abort) +{ + int err; + + switch (abort) { + case J1939_XTP_NO_ABORT: + WARN_ON_ONCE(abort == J1939_XTP_NO_ABORT); + err = 0; + break; + case J1939_XTP_ABORT_BUSY: + err = EALREADY; + break; + case J1939_XTP_ABORT_RESOURCE: + err = EMSGSIZE; + break; + case J1939_XTP_ABORT_TIMEOUT: + err = EHOSTUNREACH; + break; + case J1939_XTP_ABORT_GENERIC: + err = EBADMSG; + break; + case J1939_XTP_ABORT_FAULT: + err = ENOTRECOVERABLE; + break; + case J1939_XTP_ABORT_UNEXPECTED_DATA: + err = ENOTCONN; + break; + case J1939_XTP_ABORT_BAD_SEQ: + err = EILSEQ; + break; + case J1939_XTP_ABORT_DUP_SEQ: + err = EPROTO; + break; + case J1939_XTP_ABORT_EDPO_UNEXPECTED: + err = EPROTO; + break; + case J1939_XTP_ABORT_BAD_EDPO_PGN: + err = EPROTO; + break; + case J1939_XTP_ABORT_EDPO_OUTOF_CTS: + err = EPROTO; + break; + case J1939_XTP_ABORT_BAD_EDPO_OFFSET: + err = EPROTO; + break; + case J1939_XTP_ABORT_OTHER_DEPRECATED: + err = EPROTO; + break; + case J1939_XTP_ABORT_ECTS_UNXPECTED_PGN: + err = EPROTO; + break; + case J1939_XTP_ABORT_ECTS_TOO_BIG: + err = EPROTO; + break; + case J1939_XTP_ABORT_OTHER: + err = EPROTO; + break; + default: + netdev_warn(priv->ndev, "Unknown abort code %i", abort); + err = EPROTO; + } + + return err; +} + +static inline void j1939_session_list_lock(struct j1939_priv *priv) +{ + spin_lock_bh(&priv->active_session_list_lock); +} + +static inline void j1939_session_list_unlock(struct j1939_priv *priv) +{ + spin_unlock_bh(&priv->active_session_list_lock); +} + +void j1939_session_get(struct j1939_session *session) +{ + kref_get(&session->kref); +} + +/* session completion functions */ +static void __j1939_session_drop(struct j1939_session *session) +{ + if (!session->transmission) + return; + + j1939_sock_pending_del(session->sk); + sock_put(session->sk); +} + +static void j1939_session_destroy(struct j1939_session *session) +{ + if (session->err) + j1939_sk_errqueue(session, J1939_ERRQUEUE_ABORT); + else + j1939_sk_errqueue(session, J1939_ERRQUEUE_ACK); + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + + WARN_ON_ONCE(!list_empty(&session->sk_session_queue_entry)); + WARN_ON_ONCE(!list_empty(&session->active_session_list_entry)); + + skb_queue_purge(&session->skb_queue); + __j1939_session_drop(session); + j1939_priv_put(session->priv); + kfree(session); +} + +static void __j1939_session_release(struct kref *kref) +{ + struct j1939_session *session = container_of(kref, struct j1939_session, + kref); + + j1939_session_destroy(session); +} + +void j1939_session_put(struct j1939_session *session) +{ + kref_put(&session->kref, __j1939_session_release); +} + +static void j1939_session_txtimer_cancel(struct j1939_session *session) +{ + if (hrtimer_cancel(&session->txtimer)) + j1939_session_put(session); +} + +static void j1939_session_rxtimer_cancel(struct j1939_session *session) +{ + if (hrtimer_cancel(&session->rxtimer)) + j1939_session_put(session); +} + +void j1939_session_timers_cancel(struct j1939_session *session) +{ + j1939_session_txtimer_cancel(session); + j1939_session_rxtimer_cancel(session); +} + +static inline bool j1939_cb_is_broadcast(const struct j1939_sk_buff_cb *skcb) +{ + return (!skcb->addr.dst_name && (skcb->addr.da == 0xff)); +} + +static void j1939_session_skb_drop_old(struct j1939_session *session) +{ + struct sk_buff *do_skb; + struct j1939_sk_buff_cb *do_skcb; + unsigned int offset_start; + unsigned long flags; + + if (skb_queue_len(&session->skb_queue) < 2) + return; + + offset_start = session->pkt.tx_acked * 7; + + spin_lock_irqsave(&session->skb_queue.lock, flags); + do_skb = skb_peek(&session->skb_queue); + do_skcb = j1939_skb_to_cb(do_skb); + + if ((do_skcb->offset + do_skb->len) < offset_start) { + __skb_unlink(do_skb, &session->skb_queue); + kfree_skb(do_skb); + } + spin_unlock_irqrestore(&session->skb_queue.lock, flags); +} + +void j1939_session_skb_queue(struct j1939_session *session, + struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_priv *priv = session->priv; + + j1939_ac_fixup(priv, skb); + + if (j1939_address_is_unicast(skcb->addr.da) && + priv->ents[skcb->addr.da].nusers) + skcb->flags |= J1939_ECU_LOCAL_DST; + + skcb->flags |= J1939_ECU_LOCAL_SRC; + + skb_queue_tail(&session->skb_queue, skb); +} + +static struct sk_buff *j1939_session_skb_find(struct j1939_session *session) +{ + struct j1939_priv *priv = session->priv; + struct sk_buff *skb = NULL; + struct sk_buff *do_skb; + struct j1939_sk_buff_cb *do_skcb; + unsigned int offset_start; + unsigned long flags; + + offset_start = session->pkt.dpo * 7; + + spin_lock_irqsave(&session->skb_queue.lock, flags); + skb_queue_walk(&session->skb_queue, do_skb) { + do_skcb = j1939_skb_to_cb(do_skb); + + if (offset_start >= do_skcb->offset && + offset_start < (do_skcb->offset + do_skb->len)) { + skb = do_skb; + } + } + spin_unlock_irqrestore(&session->skb_queue.lock, flags); + + if (!skb) + netdev_dbg(priv->ndev, "%s: 0x%p: no skb found for start: %i, queue size: %i\n", + __func__, session, offset_start, + skb_queue_len(&session->skb_queue)); + + return skb; +} + +/* see if we are receiver + * returns 0 for broadcasts, although we will receive them + */ +static inline int j1939_tp_im_receiver(const struct j1939_sk_buff_cb *skcb) +{ + return skcb->flags & J1939_ECU_LOCAL_DST; +} + +/* see if we are sender */ +static inline int j1939_tp_im_transmitter(const struct j1939_sk_buff_cb *skcb) +{ + return skcb->flags & J1939_ECU_LOCAL_SRC; +} + +/* see if we are involved as either receiver or transmitter */ +static int j1939_tp_im_involved(const struct j1939_sk_buff_cb *skcb, bool swap) +{ + if (swap) + return j1939_tp_im_receiver(skcb); + else + return j1939_tp_im_transmitter(skcb); +} + +static int j1939_tp_im_involved_anydir(struct j1939_sk_buff_cb *skcb) +{ + return skcb->flags & (J1939_ECU_LOCAL_SRC | J1939_ECU_LOCAL_DST); +} + +/* extract pgn from flow-ctl message */ +static inline pgn_t j1939_xtp_ctl_to_pgn(const u8 *dat) +{ + pgn_t pgn; + + pgn = (dat[7] << 16) | (dat[6] << 8) | (dat[5] << 0); + if (j1939_pgn_is_pdu1(pgn)) + pgn &= 0xffff00; + return pgn; +} + +static inline unsigned int j1939_tp_ctl_to_size(const u8 *dat) +{ + return (dat[2] << 8) + (dat[1] << 0); +} + +static inline unsigned int j1939_etp_ctl_to_packet(const u8 *dat) +{ + return (dat[4] << 16) | (dat[3] << 8) | (dat[2] << 0); +} + +static inline unsigned int j1939_etp_ctl_to_size(const u8 *dat) +{ + return (dat[4] << 24) | (dat[3] << 16) | + (dat[2] << 8) | (dat[1] << 0); +} + +/* find existing session: + * reverse: swap cb's src & dst + * there is no problem with matching broadcasts, since + * broadcasts (no dst, no da) would never call this + * with reverse == true + */ +static bool j1939_session_match(struct j1939_addr *se_addr, + struct j1939_addr *sk_addr, bool reverse) +{ + if (se_addr->type != sk_addr->type) + return false; + + if (reverse) { + if (se_addr->src_name) { + if (se_addr->src_name != sk_addr->dst_name) + return false; + } else if (se_addr->sa != sk_addr->da) { + return false; + } + + if (se_addr->dst_name) { + if (se_addr->dst_name != sk_addr->src_name) + return false; + } else if (se_addr->da != sk_addr->sa) { + return false; + } + } else { + if (se_addr->src_name) { + if (se_addr->src_name != sk_addr->src_name) + return false; + } else if (se_addr->sa != sk_addr->sa) { + return false; + } + + if (se_addr->dst_name) { + if (se_addr->dst_name != sk_addr->dst_name) + return false; + } else if (se_addr->da != sk_addr->da) { + return false; + } + } + + return true; +} + +static struct +j1939_session *j1939_session_get_by_addr_locked(struct j1939_priv *priv, + struct list_head *root, + struct j1939_addr *addr, + bool reverse, bool transmitter) +{ + struct j1939_session *session; + + lockdep_assert_held(&priv->active_session_list_lock); + + list_for_each_entry(session, root, active_session_list_entry) { + j1939_session_get(session); + if (j1939_session_match(&session->skcb.addr, addr, reverse) && + session->transmission == transmitter) + return session; + j1939_session_put(session); + } + + return NULL; +} + +static struct +j1939_session *j1939_session_get_simple(struct j1939_priv *priv, + struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_session *session; + + lockdep_assert_held(&priv->active_session_list_lock); + + list_for_each_entry(session, &priv->active_session_list, + active_session_list_entry) { + j1939_session_get(session); + if (session->skcb.addr.type == J1939_SIMPLE && + session->tskey == skcb->tskey && session->sk == skb->sk) + return session; + j1939_session_put(session); + } + + return NULL; +} + +static struct +j1939_session *j1939_session_get_by_addr(struct j1939_priv *priv, + struct j1939_addr *addr, + bool reverse, bool transmitter) +{ + struct j1939_session *session; + + j1939_session_list_lock(priv); + session = j1939_session_get_by_addr_locked(priv, + &priv->active_session_list, + addr, reverse, transmitter); + j1939_session_list_unlock(priv); + + return session; +} + +static void j1939_skbcb_swap(struct j1939_sk_buff_cb *skcb) +{ + u8 tmp = 0; + + swap(skcb->addr.dst_name, skcb->addr.src_name); + swap(skcb->addr.da, skcb->addr.sa); + + /* swap SRC and DST flags, leave other untouched */ + if (skcb->flags & J1939_ECU_LOCAL_SRC) + tmp |= J1939_ECU_LOCAL_DST; + if (skcb->flags & J1939_ECU_LOCAL_DST) + tmp |= J1939_ECU_LOCAL_SRC; + skcb->flags &= ~(J1939_ECU_LOCAL_SRC | J1939_ECU_LOCAL_DST); + skcb->flags |= tmp; +} + +static struct +sk_buff *j1939_tp_tx_dat_new(struct j1939_priv *priv, + const struct j1939_sk_buff_cb *re_skcb, + bool ctl, + bool swap_src_dst) +{ + struct sk_buff *skb; + struct j1939_sk_buff_cb *skcb; + + skb = alloc_skb(sizeof(struct can_frame) + sizeof(struct can_skb_priv), + GFP_ATOMIC); + if (unlikely(!skb)) + return ERR_PTR(-ENOMEM); + + skb->dev = priv->ndev; + can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = priv->ndev->ifindex; + /* reserve CAN header */ + skb_reserve(skb, offsetof(struct can_frame, data)); + + memcpy(skb->cb, re_skcb, sizeof(skb->cb)); + skcb = j1939_skb_to_cb(skb); + if (swap_src_dst) + j1939_skbcb_swap(skcb); + + if (ctl) { + if (skcb->addr.type == J1939_ETP) + skcb->addr.pgn = J1939_ETP_PGN_CTL; + else + skcb->addr.pgn = J1939_TP_PGN_CTL; + } else { + if (skcb->addr.type == J1939_ETP) + skcb->addr.pgn = J1939_ETP_PGN_DAT; + else + skcb->addr.pgn = J1939_TP_PGN_DAT; + } + + return skb; +} + +/* TP transmit packet functions */ +static int j1939_tp_tx_dat(struct j1939_session *session, + const u8 *dat, int len) +{ + struct j1939_priv *priv = session->priv; + struct sk_buff *skb; + + skb = j1939_tp_tx_dat_new(priv, &session->skcb, + false, false); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + skb_put_data(skb, dat, len); + if (j1939_tp_padding && len < 8) + memset(skb_put(skb, 8 - len), 0xff, 8 - len); + + return j1939_send_one(priv, skb); +} + +static int j1939_xtp_do_tx_ctl(struct j1939_priv *priv, + const struct j1939_sk_buff_cb *re_skcb, + bool swap_src_dst, pgn_t pgn, const u8 *dat) +{ + struct sk_buff *skb; + u8 *skdat; + + if (!j1939_tp_im_involved(re_skcb, swap_src_dst)) + return 0; + + skb = j1939_tp_tx_dat_new(priv, re_skcb, true, swap_src_dst); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + skdat = skb_put(skb, 8); + memcpy(skdat, dat, 5); + skdat[5] = (pgn >> 0); + skdat[6] = (pgn >> 8); + skdat[7] = (pgn >> 16); + + return j1939_send_one(priv, skb); +} + +static inline int j1939_tp_tx_ctl(struct j1939_session *session, + bool swap_src_dst, const u8 *dat) +{ + struct j1939_priv *priv = session->priv; + + return j1939_xtp_do_tx_ctl(priv, &session->skcb, + swap_src_dst, + session->skcb.addr.pgn, dat); +} + +static int j1939_xtp_tx_abort(struct j1939_priv *priv, + const struct j1939_sk_buff_cb *re_skcb, + bool swap_src_dst, + enum j1939_xtp_abort err, + pgn_t pgn) +{ + u8 dat[5]; + + if (!j1939_tp_im_involved(re_skcb, swap_src_dst)) + return 0; + + memset(dat, 0xff, sizeof(dat)); + dat[0] = J1939_TP_CMD_ABORT; + dat[1] = err; + return j1939_xtp_do_tx_ctl(priv, re_skcb, swap_src_dst, pgn, dat); +} + +void j1939_tp_schedule_txtimer(struct j1939_session *session, int msec) +{ + j1939_session_get(session); + hrtimer_start(&session->txtimer, ms_to_ktime(msec), + HRTIMER_MODE_REL_SOFT); +} + +static inline void j1939_tp_set_rxtimeout(struct j1939_session *session, + int msec) +{ + j1939_session_rxtimer_cancel(session); + j1939_session_get(session); + hrtimer_start(&session->rxtimer, ms_to_ktime(msec), + HRTIMER_MODE_REL_SOFT); +} + +static int j1939_session_tx_rts(struct j1939_session *session) +{ + u8 dat[8]; + int ret; + + memset(dat, 0xff, sizeof(dat)); + + dat[1] = (session->total_message_size >> 0); + dat[2] = (session->total_message_size >> 8); + dat[3] = session->pkt.total; + + if (session->skcb.addr.type == J1939_ETP) { + dat[0] = J1939_ETP_CMD_RTS; + dat[1] = (session->total_message_size >> 0); + dat[2] = (session->total_message_size >> 8); + dat[3] = (session->total_message_size >> 16); + dat[4] = (session->total_message_size >> 24); + } else if (j1939_cb_is_broadcast(&session->skcb)) { + dat[0] = J1939_TP_CMD_BAM; + /* fake cts for broadcast */ + session->pkt.tx = 0; + } else { + dat[0] = J1939_TP_CMD_RTS; + dat[4] = dat[3]; + } + + if (dat[0] == session->last_txcmd) + /* done already */ + return 0; + + ret = j1939_tp_tx_ctl(session, false, dat); + if (ret < 0) + return ret; + + session->last_txcmd = dat[0]; + if (dat[0] == J1939_TP_CMD_BAM) + j1939_tp_schedule_txtimer(session, 50); + + j1939_tp_set_rxtimeout(session, 1250); + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + + return 0; +} + +static int j1939_session_tx_dpo(struct j1939_session *session) +{ + unsigned int pkt; + u8 dat[8]; + int ret; + + memset(dat, 0xff, sizeof(dat)); + + dat[0] = J1939_ETP_CMD_DPO; + session->pkt.dpo = session->pkt.tx_acked; + pkt = session->pkt.dpo; + dat[1] = session->pkt.last - session->pkt.tx_acked; + dat[2] = (pkt >> 0); + dat[3] = (pkt >> 8); + dat[4] = (pkt >> 16); + + ret = j1939_tp_tx_ctl(session, false, dat); + if (ret < 0) + return ret; + + session->last_txcmd = dat[0]; + j1939_tp_set_rxtimeout(session, 1250); + session->pkt.tx = session->pkt.tx_acked; + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + + return 0; +} + +static int j1939_session_tx_dat(struct j1939_session *session) +{ + struct j1939_priv *priv = session->priv; + struct j1939_sk_buff_cb *skcb; + int offset, pkt_done, pkt_end; + unsigned int len, pdelay; + struct sk_buff *se_skb; + const u8 *tpdat; + int ret = 0; + u8 dat[8]; + + se_skb = j1939_session_skb_find(session); + if (!se_skb) + return -ENOBUFS; + + skcb = j1939_skb_to_cb(se_skb); + tpdat = se_skb->data; + ret = 0; + pkt_done = 0; + if (session->skcb.addr.type != J1939_ETP && + j1939_cb_is_broadcast(&session->skcb)) + pkt_end = session->pkt.total; + else + pkt_end = session->pkt.last; + + while (session->pkt.tx < pkt_end) { + dat[0] = session->pkt.tx - session->pkt.dpo + 1; + offset = (session->pkt.tx * 7) - skcb->offset; + len = se_skb->len - offset; + if (len > 7) + len = 7; + + memcpy(&dat[1], &tpdat[offset], len); + ret = j1939_tp_tx_dat(session, dat, len + 1); + if (ret < 0) { + /* ENOBUS == CAN interface TX queue is full */ + if (ret != -ENOBUFS) + netdev_alert(priv->ndev, + "%s: 0x%p: queue data error: %i\n", + __func__, session, ret); + break; + } + + session->last_txcmd = 0xff; + pkt_done++; + session->pkt.tx++; + pdelay = j1939_cb_is_broadcast(&session->skcb) ? 50 : + j1939_tp_packet_delay; + + if (session->pkt.tx < session->pkt.total && pdelay) { + j1939_tp_schedule_txtimer(session, pdelay); + break; + } + } + + if (pkt_done) + j1939_tp_set_rxtimeout(session, 250); + + return ret; +} + +static int j1939_xtp_txnext_transmiter(struct j1939_session *session) +{ + struct j1939_priv *priv = session->priv; + int ret = 0; + + if (!j1939_tp_im_transmitter(&session->skcb)) { + netdev_alert(priv->ndev, "%s: 0x%p: called by not transmitter!\n", + __func__, session); + return -EINVAL; + } + + switch (session->last_cmd) { + case 0: + ret = j1939_session_tx_rts(session); + break; + + case J1939_ETP_CMD_CTS: + if (session->last_txcmd != J1939_ETP_CMD_DPO) { + ret = j1939_session_tx_dpo(session); + if (ret) + return ret; + } + + /* fall through */ + case J1939_TP_CMD_CTS: + case 0xff: /* did some data */ + case J1939_ETP_CMD_DPO: + case J1939_TP_CMD_BAM: + ret = j1939_session_tx_dat(session); + + break; + default: + netdev_alert(priv->ndev, "%s: 0x%p: unexpected last_cmd: %x\n", + __func__, session, session->last_cmd); + } + + return ret; +} + +static int j1939_session_tx_cts(struct j1939_session *session) +{ + struct j1939_priv *priv = session->priv; + unsigned int pkt, len; + int ret; + u8 dat[8]; + + if (!j1939_sk_recv_match(priv, &session->skcb)) + return -ENOENT; + + len = session->pkt.total - session->pkt.rx; + len = min3(len, session->pkt.block, j1939_tp_block ?: 255); + memset(dat, 0xff, sizeof(dat)); + + if (session->skcb.addr.type == J1939_ETP) { + pkt = session->pkt.rx + 1; + dat[0] = J1939_ETP_CMD_CTS; + dat[1] = len; + dat[2] = (pkt >> 0); + dat[3] = (pkt >> 8); + dat[4] = (pkt >> 16); + } else { + dat[0] = J1939_TP_CMD_CTS; + dat[1] = len; + dat[2] = session->pkt.rx + 1; + } + + if (dat[0] == session->last_txcmd) + /* done already */ + return 0; + + ret = j1939_tp_tx_ctl(session, true, dat); + if (ret < 0) + return ret; + + if (len) + /* only mark cts done when len is set */ + session->last_txcmd = dat[0]; + j1939_tp_set_rxtimeout(session, 1250); + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + + return 0; +} + +static int j1939_session_tx_eoma(struct j1939_session *session) +{ + struct j1939_priv *priv = session->priv; + u8 dat[8]; + int ret; + + if (!j1939_sk_recv_match(priv, &session->skcb)) + return -ENOENT; + + memset(dat, 0xff, sizeof(dat)); + + if (session->skcb.addr.type == J1939_ETP) { + dat[0] = J1939_ETP_CMD_EOMA; + dat[1] = session->total_message_size >> 0; + dat[2] = session->total_message_size >> 8; + dat[3] = session->total_message_size >> 16; + dat[4] = session->total_message_size >> 24; + } else { + dat[0] = J1939_TP_CMD_EOMA; + dat[1] = session->total_message_size; + dat[2] = session->total_message_size >> 8; + dat[3] = session->pkt.total; + } + + if (dat[0] == session->last_txcmd) + /* done already */ + return 0; + + ret = j1939_tp_tx_ctl(session, true, dat); + if (ret < 0) + return ret; + + session->last_txcmd = dat[0]; + + /* wait for the EOMA packet to come in */ + j1939_tp_set_rxtimeout(session, 1250); + + netdev_dbg(session->priv->ndev, "%p: 0x%p\n", __func__, session); + + return 0; +} + +static int j1939_xtp_txnext_receiver(struct j1939_session *session) +{ + struct j1939_priv *priv = session->priv; + int ret = 0; + + if (!j1939_tp_im_receiver(&session->skcb)) { + netdev_alert(priv->ndev, "%s: 0x%p: called by not receiver!\n", + __func__, session); + return -EINVAL; + } + + switch (session->last_cmd) { + case J1939_TP_CMD_RTS: + case J1939_ETP_CMD_RTS: + ret = j1939_session_tx_cts(session); + break; + + case J1939_ETP_CMD_CTS: + case J1939_TP_CMD_CTS: + case 0xff: /* did some data */ + case J1939_ETP_CMD_DPO: + if ((session->skcb.addr.type == J1939_TP && + j1939_cb_is_broadcast(&session->skcb))) + break; + + if (session->pkt.rx >= session->pkt.total) { + ret = j1939_session_tx_eoma(session); + } else if (session->pkt.rx >= session->pkt.last) { + session->last_txcmd = 0; + ret = j1939_session_tx_cts(session); + } + break; + default: + netdev_alert(priv->ndev, "%s: 0x%p: unexpected last_cmd: %x\n", + __func__, session, session->last_cmd); + } + + return ret; +} + +static int j1939_simple_txnext(struct j1939_session *session) +{ + struct j1939_priv *priv = session->priv; + struct sk_buff *se_skb = j1939_session_skb_find(session); + struct sk_buff *skb; + int ret; + + if (!se_skb) + return 0; + + skb = skb_clone(se_skb, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + can_skb_set_owner(skb, se_skb->sk); + + j1939_tp_set_rxtimeout(session, J1939_SIMPLE_ECHO_TIMEOUT_MS); + + ret = j1939_send_one(priv, skb); + if (ret) + return ret; + + j1939_sk_errqueue(session, J1939_ERRQUEUE_SCHED); + j1939_sk_queue_activate_next(session); + + return 0; +} + +static bool j1939_session_deactivate_locked(struct j1939_session *session) +{ + bool active = false; + + lockdep_assert_held(&session->priv->active_session_list_lock); + + if (session->state >= J1939_SESSION_ACTIVE && + session->state < J1939_SESSION_ACTIVE_MAX) { + active = true; + + list_del_init(&session->active_session_list_entry); + session->state = J1939_SESSION_DONE; + j1939_session_put(session); + } + + return active; +} + +static bool j1939_session_deactivate(struct j1939_session *session) +{ + bool active; + + j1939_session_list_lock(session->priv); + active = j1939_session_deactivate_locked(session); + j1939_session_list_unlock(session->priv); + + return active; +} + +static void +j1939_session_deactivate_activate_next(struct j1939_session *session) +{ + if (j1939_session_deactivate(session)) + j1939_sk_queue_activate_next(session); +} + +static void __j1939_session_cancel(struct j1939_session *session, + enum j1939_xtp_abort err) +{ + struct j1939_priv *priv = session->priv; + + WARN_ON_ONCE(!err); + lockdep_assert_held(&session->priv->active_session_list_lock); + + session->err = j1939_xtp_abort_to_errno(priv, err); + /* do not send aborts on incoming broadcasts */ + if (!j1939_cb_is_broadcast(&session->skcb)) { + session->state = J1939_SESSION_WAITING_ABORT; + j1939_xtp_tx_abort(priv, &session->skcb, + !session->transmission, + err, session->skcb.addr.pgn); + } + + if (session->sk) + j1939_sk_send_loop_abort(session->sk, session->err); +} + +static void j1939_session_cancel(struct j1939_session *session, + enum j1939_xtp_abort err) +{ + j1939_session_list_lock(session->priv); + + if (session->state >= J1939_SESSION_ACTIVE && + session->state < J1939_SESSION_WAITING_ABORT) { + j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS); + __j1939_session_cancel(session, err); + } + + j1939_session_list_unlock(session->priv); +} + +static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer) +{ + struct j1939_session *session = + container_of(hrtimer, struct j1939_session, txtimer); + struct j1939_priv *priv = session->priv; + int ret = 0; + + if (session->skcb.addr.type == J1939_SIMPLE) { + ret = j1939_simple_txnext(session); + } else { + if (session->transmission) + ret = j1939_xtp_txnext_transmiter(session); + else + ret = j1939_xtp_txnext_receiver(session); + } + + switch (ret) { + case -ENOBUFS: + /* Retry limit is currently arbitrary chosen */ + if (session->tx_retry < J1939_XTP_TX_RETRY_LIMIT) { + session->tx_retry++; + j1939_tp_schedule_txtimer(session, + 10 + prandom_u32_max(16)); + } else { + netdev_alert(priv->ndev, "%s: 0x%p: tx retry count reached\n", + __func__, session); + session->err = -ENETUNREACH; + j1939_session_rxtimer_cancel(session); + j1939_session_deactivate_activate_next(session); + } + break; + case -ENETDOWN: + /* In this case we should get a netdev_event(), all active + * sessions will be cleared by + * j1939_cancel_all_active_sessions(). So handle this as an + * error, but let j1939_cancel_all_active_sessions() do the + * cleanup including propagation of the error to user space. + */ + break; + case 0: + session->tx_retry = 0; + break; + default: + netdev_alert(priv->ndev, "%s: 0x%p: tx aborted with unknown reason: %i\n", + __func__, session, ret); + if (session->skcb.addr.type != J1939_SIMPLE) { + j1939_session_cancel(session, J1939_XTP_ABORT_OTHER); + } else { + session->err = ret; + j1939_session_rxtimer_cancel(session); + j1939_session_deactivate_activate_next(session); + } + } + + j1939_session_put(session); + + return HRTIMER_NORESTART; +} + +static void j1939_session_completed(struct j1939_session *session) +{ + struct sk_buff *skb; + + if (!session->transmission) { + skb = j1939_session_skb_find(session); + /* distribute among j1939 receivers */ + j1939_sk_recv(session->priv, skb); + } + + j1939_session_deactivate_activate_next(session); +} + +static enum hrtimer_restart j1939_tp_rxtimer(struct hrtimer *hrtimer) +{ + struct j1939_session *session = container_of(hrtimer, + struct j1939_session, + rxtimer); + struct j1939_priv *priv = session->priv; + + if (session->state == J1939_SESSION_WAITING_ABORT) { + netdev_alert(priv->ndev, "%s: 0x%p: abort rx timeout. Force session deactivation\n", + __func__, session); + + j1939_session_deactivate_activate_next(session); + + } else if (session->skcb.addr.type == J1939_SIMPLE) { + netdev_alert(priv->ndev, "%s: 0x%p: Timeout. Failed to send simple message.\n", + __func__, session); + + /* The message is probably stuck in the CAN controller and can + * be send as soon as CAN bus is in working state again. + */ + session->err = -ETIME; + j1939_session_deactivate(session); + } else { + netdev_alert(priv->ndev, "%s: 0x%p: rx timeout, send abort\n", + __func__, session); + + j1939_session_list_lock(session->priv); + if (session->state >= J1939_SESSION_ACTIVE && + session->state < J1939_SESSION_ACTIVE_MAX) { + j1939_session_get(session); + hrtimer_start(&session->rxtimer, + ms_to_ktime(J1939_XTP_ABORT_TIMEOUT_MS), + HRTIMER_MODE_REL_SOFT); + __j1939_session_cancel(session, J1939_XTP_ABORT_TIMEOUT); + } + j1939_session_list_unlock(session->priv); + } + + j1939_session_put(session); + + return HRTIMER_NORESTART; +} + +static bool j1939_xtp_rx_cmd_bad_pgn(struct j1939_session *session, + const struct sk_buff *skb) +{ + const struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + pgn_t pgn = j1939_xtp_ctl_to_pgn(skb->data); + struct j1939_priv *priv = session->priv; + enum j1939_xtp_abort abort = J1939_XTP_NO_ABORT; + u8 cmd = skb->data[0]; + + if (session->skcb.addr.pgn == pgn) + return false; + + switch (cmd) { + case J1939_TP_CMD_BAM: + abort = J1939_XTP_NO_ABORT; + break; + + case J1939_ETP_CMD_RTS: + case J1939_TP_CMD_RTS: /* fall through */ + abort = J1939_XTP_ABORT_BUSY; + break; + + case J1939_ETP_CMD_CTS: + case J1939_TP_CMD_CTS: /* fall through */ + abort = J1939_XTP_ABORT_ECTS_UNXPECTED_PGN; + break; + + case J1939_ETP_CMD_DPO: + abort = J1939_XTP_ABORT_BAD_EDPO_PGN; + break; + + case J1939_ETP_CMD_EOMA: + case J1939_TP_CMD_EOMA: /* fall through */ + abort = J1939_XTP_ABORT_OTHER; + break; + + case J1939_ETP_CMD_ABORT: /* && J1939_TP_CMD_ABORT */ + abort = J1939_XTP_NO_ABORT; + break; + + default: + WARN_ON_ONCE(1); + break; + } + + netdev_warn(priv->ndev, "%s: 0x%p: CMD 0x%02x with PGN 0x%05x for running session with different PGN 0x%05x.\n", + __func__, session, cmd, pgn, session->skcb.addr.pgn); + if (abort != J1939_XTP_NO_ABORT) + j1939_xtp_tx_abort(priv, skcb, true, abort, pgn); + + return true; +} + +static void j1939_xtp_rx_abort_one(struct j1939_priv *priv, struct sk_buff *skb, + bool reverse, bool transmitter) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_session *session; + u8 abort = skb->data[1]; + + session = j1939_session_get_by_addr(priv, &skcb->addr, reverse, + transmitter); + if (!session) + return; + + if (j1939_xtp_rx_cmd_bad_pgn(session, skb)) + goto abort_put; + + netdev_info(priv->ndev, "%s: 0x%p: 0x%05x: (%u) %s\n", __func__, + session, j1939_xtp_ctl_to_pgn(skb->data), abort, + j1939_xtp_abort_to_str(abort)); + + j1939_session_timers_cancel(session); + session->err = j1939_xtp_abort_to_errno(priv, abort); + if (session->sk) + j1939_sk_send_loop_abort(session->sk, session->err); + j1939_session_deactivate_activate_next(session); + +abort_put: + j1939_session_put(session); +} + +/* abort packets may come in 2 directions */ +static void +j1939_xtp_rx_abort(struct j1939_priv *priv, struct sk_buff *skb, + bool transmitter) +{ + j1939_xtp_rx_abort_one(priv, skb, false, transmitter); + j1939_xtp_rx_abort_one(priv, skb, true, transmitter); +} + +static void +j1939_xtp_rx_eoma_one(struct j1939_session *session, struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + const u8 *dat; + int len; + + if (j1939_xtp_rx_cmd_bad_pgn(session, skb)) + return; + + dat = skb->data; + + if (skcb->addr.type == J1939_ETP) + len = j1939_etp_ctl_to_size(dat); + else + len = j1939_tp_ctl_to_size(dat); + + if (session->total_message_size != len) { + netdev_warn_once(session->priv->ndev, + "%s: 0x%p: Incorrect size. Expected: %i; got: %i.\n", + __func__, session, session->total_message_size, + len); + } + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + + session->pkt.tx_acked = session->pkt.total; + j1939_session_timers_cancel(session); + /* transmitted without problems */ + j1939_session_completed(session); +} + +static void +j1939_xtp_rx_eoma(struct j1939_priv *priv, struct sk_buff *skb, + bool transmitter) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_session *session; + + session = j1939_session_get_by_addr(priv, &skcb->addr, true, + transmitter); + if (!session) + return; + + j1939_xtp_rx_eoma_one(session, skb); + j1939_session_put(session); +} + +static void +j1939_xtp_rx_cts_one(struct j1939_session *session, struct sk_buff *skb) +{ + enum j1939_xtp_abort err = J1939_XTP_ABORT_FAULT; + unsigned int pkt; + const u8 *dat; + + dat = skb->data; + + if (j1939_xtp_rx_cmd_bad_pgn(session, skb)) + return; + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + + if (session->last_cmd == dat[0]) { + err = J1939_XTP_ABORT_DUP_SEQ; + goto out_session_cancel; + } + + if (session->skcb.addr.type == J1939_ETP) + pkt = j1939_etp_ctl_to_packet(dat); + else + pkt = dat[2]; + + if (!pkt) + goto out_session_cancel; + else if (dat[1] > session->pkt.block /* 0xff for etp */) + goto out_session_cancel; + + /* set packet counters only when not CTS(0) */ + session->pkt.tx_acked = pkt - 1; + j1939_session_skb_drop_old(session); + session->pkt.last = session->pkt.tx_acked + dat[1]; + if (session->pkt.last > session->pkt.total) + /* safety measure */ + session->pkt.last = session->pkt.total; + /* TODO: do not set tx here, do it in txtimer */ + session->pkt.tx = session->pkt.tx_acked; + + session->last_cmd = dat[0]; + if (dat[1]) { + j1939_tp_set_rxtimeout(session, 1250); + if (session->transmission) { + if (session->pkt.tx_acked) + j1939_sk_errqueue(session, + J1939_ERRQUEUE_SCHED); + j1939_session_txtimer_cancel(session); + j1939_tp_schedule_txtimer(session, 0); + } + } else { + /* CTS(0) */ + j1939_tp_set_rxtimeout(session, 550); + } + return; + + out_session_cancel: + j1939_session_timers_cancel(session); + j1939_session_cancel(session, err); +} + +static void +j1939_xtp_rx_cts(struct j1939_priv *priv, struct sk_buff *skb, bool transmitter) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_session *session; + + session = j1939_session_get_by_addr(priv, &skcb->addr, true, + transmitter); + if (!session) + return; + j1939_xtp_rx_cts_one(session, skb); + j1939_session_put(session); +} + +static struct j1939_session *j1939_session_new(struct j1939_priv *priv, + struct sk_buff *skb, size_t size) +{ + struct j1939_session *session; + struct j1939_sk_buff_cb *skcb; + + session = kzalloc(sizeof(*session), gfp_any()); + if (!session) + return NULL; + + INIT_LIST_HEAD(&session->active_session_list_entry); + INIT_LIST_HEAD(&session->sk_session_queue_entry); + kref_init(&session->kref); + + j1939_priv_get(priv); + session->priv = priv; + session->total_message_size = size; + session->state = J1939_SESSION_NEW; + + skb_queue_head_init(&session->skb_queue); + skb_queue_tail(&session->skb_queue, skb); + + skcb = j1939_skb_to_cb(skb); + memcpy(&session->skcb, skcb, sizeof(session->skcb)); + + hrtimer_init(&session->txtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); + session->txtimer.function = j1939_tp_txtimer; + hrtimer_init(&session->rxtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); + session->rxtimer.function = j1939_tp_rxtimer; + + netdev_dbg(priv->ndev, "%s: 0x%p: sa: %02x, da: %02x\n", + __func__, session, skcb->addr.sa, skcb->addr.da); + + return session; +} + +static struct +j1939_session *j1939_session_fresh_new(struct j1939_priv *priv, + int size, + const struct j1939_sk_buff_cb *rel_skcb) +{ + struct sk_buff *skb; + struct j1939_sk_buff_cb *skcb; + struct j1939_session *session; + + skb = alloc_skb(size + sizeof(struct can_skb_priv), GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + skb->dev = priv->ndev; + can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = priv->ndev->ifindex; + skcb = j1939_skb_to_cb(skb); + memcpy(skcb, rel_skcb, sizeof(*skcb)); + + session = j1939_session_new(priv, skb, size); + if (!session) { + kfree_skb(skb); + return NULL; + } + + /* alloc data area */ + skb_put(skb, size); + /* skb is recounted in j1939_session_new() */ + return session; +} + +int j1939_session_activate(struct j1939_session *session) +{ + struct j1939_priv *priv = session->priv; + struct j1939_session *active = NULL; + int ret = 0; + + j1939_session_list_lock(priv); + if (session->skcb.addr.type != J1939_SIMPLE) + active = j1939_session_get_by_addr_locked(priv, + &priv->active_session_list, + &session->skcb.addr, false, + session->transmission); + if (active) { + j1939_session_put(active); + ret = -EAGAIN; + } else { + WARN_ON_ONCE(session->state != J1939_SESSION_NEW); + list_add_tail(&session->active_session_list_entry, + &priv->active_session_list); + j1939_session_get(session); + session->state = J1939_SESSION_ACTIVE; + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", + __func__, session); + } + j1939_session_list_unlock(priv); + + return ret; +} + +static struct +j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv, + struct sk_buff *skb) +{ + enum j1939_xtp_abort abort = J1939_XTP_NO_ABORT; + struct j1939_sk_buff_cb skcb = *j1939_skb_to_cb(skb); + struct j1939_session *session; + const u8 *dat; + pgn_t pgn; + int len; + + netdev_dbg(priv->ndev, "%s\n", __func__); + + dat = skb->data; + pgn = j1939_xtp_ctl_to_pgn(dat); + skcb.addr.pgn = pgn; + + if (!j1939_sk_recv_match(priv, &skcb)) + return NULL; + + if (skcb.addr.type == J1939_ETP) { + len = j1939_etp_ctl_to_size(dat); + if (len > J1939_MAX_ETP_PACKET_SIZE) + abort = J1939_XTP_ABORT_FAULT; + else if (len > priv->tp_max_packet_size) + abort = J1939_XTP_ABORT_RESOURCE; + else if (len <= J1939_MAX_TP_PACKET_SIZE) + abort = J1939_XTP_ABORT_FAULT; + } else { + len = j1939_tp_ctl_to_size(dat); + if (len > J1939_MAX_TP_PACKET_SIZE) + abort = J1939_XTP_ABORT_FAULT; + else if (len > priv->tp_max_packet_size) + abort = J1939_XTP_ABORT_RESOURCE; + } + + if (abort != J1939_XTP_NO_ABORT) { + j1939_xtp_tx_abort(priv, &skcb, true, abort, pgn); + return NULL; + } + + session = j1939_session_fresh_new(priv, len, &skcb); + if (!session) { + j1939_xtp_tx_abort(priv, &skcb, true, + J1939_XTP_ABORT_RESOURCE, pgn); + return NULL; + } + + /* initialize the control buffer: plain copy */ + session->pkt.total = (len + 6) / 7; + session->pkt.block = 0xff; + if (skcb.addr.type != J1939_ETP) { + if (dat[3] != session->pkt.total) + netdev_alert(priv->ndev, "%s: 0x%p: strange total, %u != %u\n", + __func__, session, session->pkt.total, + dat[3]); + session->pkt.total = dat[3]; + session->pkt.block = min(dat[3], dat[4]); + } + + session->pkt.rx = 0; + session->pkt.tx = 0; + + WARN_ON_ONCE(j1939_session_activate(session)); + + return session; +} + +static int j1939_xtp_rx_rts_session_active(struct j1939_session *session, + struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_priv *priv = session->priv; + + if (!session->transmission) { + if (j1939_xtp_rx_cmd_bad_pgn(session, skb)) + return -EBUSY; + + /* RTS on active session */ + j1939_session_timers_cancel(session); + j1939_session_cancel(session, J1939_XTP_ABORT_BUSY); + } + + if (session->last_cmd != 0) { + /* we received a second rts on the same connection */ + netdev_alert(priv->ndev, "%s: 0x%p: connection exists (%02x %02x). last cmd: %x\n", + __func__, session, skcb->addr.sa, skcb->addr.da, + session->last_cmd); + + j1939_session_timers_cancel(session); + j1939_session_cancel(session, J1939_XTP_ABORT_BUSY); + + return -EBUSY; + } + + if (session->skcb.addr.sa != skcb->addr.sa || + session->skcb.addr.da != skcb->addr.da) + netdev_warn(priv->ndev, "%s: 0x%p: session->skcb.addr.sa=0x%02x skcb->addr.sa=0x%02x session->skcb.addr.da=0x%02x skcb->addr.da=0x%02x\n", + __func__, session, + session->skcb.addr.sa, skcb->addr.sa, + session->skcb.addr.da, skcb->addr.da); + /* make sure 'sa' & 'da' are correct ! + * They may be 'not filled in yet' for sending + * skb's, since they did not pass the Address Claim ever. + */ + session->skcb.addr.sa = skcb->addr.sa; + session->skcb.addr.da = skcb->addr.da; + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + + return 0; +} + +static void j1939_xtp_rx_rts(struct j1939_priv *priv, struct sk_buff *skb, + bool transmitter) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_session *session; + u8 cmd = skb->data[0]; + + session = j1939_session_get_by_addr(priv, &skcb->addr, false, + transmitter); + + if (!session) { + if (transmitter) { + /* If we're the transmitter and this function is called, + * we received our own RTS. A session has already been + * created. + * + * For some reasons however it might have been destroyed + * already. So don't create a new one here (using + * "j1939_xtp_rx_rts_session_new()") as this will be a + * receiver session. + * + * The reasons the session is already destroyed might + * be: + * - user space closed socket was and the session was + * aborted + * - session was aborted due to external abort message + */ + return; + } + session = j1939_xtp_rx_rts_session_new(priv, skb); + if (!session) + return; + } else { + if (j1939_xtp_rx_rts_session_active(session, skb)) { + j1939_session_put(session); + return; + } + } + session->last_cmd = cmd; + + j1939_tp_set_rxtimeout(session, 1250); + + if (cmd != J1939_TP_CMD_BAM && !session->transmission) { + j1939_session_txtimer_cancel(session); + j1939_tp_schedule_txtimer(session, 0); + } + + j1939_session_put(session); +} + +static void j1939_xtp_rx_dpo_one(struct j1939_session *session, + struct sk_buff *skb) +{ + const u8 *dat = skb->data; + + if (j1939_xtp_rx_cmd_bad_pgn(session, skb)) + return; + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + + /* transmitted without problems */ + session->pkt.dpo = j1939_etp_ctl_to_packet(skb->data); + session->last_cmd = dat[0]; + j1939_tp_set_rxtimeout(session, 750); +} + +static void j1939_xtp_rx_dpo(struct j1939_priv *priv, struct sk_buff *skb, + bool transmitter) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_session *session; + + session = j1939_session_get_by_addr(priv, &skcb->addr, false, + transmitter); + if (!session) { + netdev_info(priv->ndev, + "%s: no connection found\n", __func__); + return; + } + + j1939_xtp_rx_dpo_one(session, skb); + j1939_session_put(session); +} + +static void j1939_xtp_rx_dat_one(struct j1939_session *session, + struct sk_buff *skb) +{ + struct j1939_priv *priv = session->priv; + struct j1939_sk_buff_cb *skcb; + struct sk_buff *se_skb; + const u8 *dat; + u8 *tpdat; + int offset; + int nbytes; + bool final = false; + bool do_cts_eoma = false; + int packet; + + skcb = j1939_skb_to_cb(skb); + dat = skb->data; + if (skb->len <= 1) + /* makes no sense */ + goto out_session_cancel; + + switch (session->last_cmd) { + case 0xff: + break; + case J1939_ETP_CMD_DPO: + if (skcb->addr.type == J1939_ETP) + break; + /* fall through */ + case J1939_TP_CMD_BAM: /* fall through */ + case J1939_TP_CMD_CTS: /* fall through */ + if (skcb->addr.type != J1939_ETP) + break; + /* fall through */ + default: + netdev_info(priv->ndev, "%s: 0x%p: last %02x\n", __func__, + session, session->last_cmd); + goto out_session_cancel; + } + + packet = (dat[0] - 1 + session->pkt.dpo); + if (packet > session->pkt.total || + (session->pkt.rx + 1) > session->pkt.total) { + netdev_info(priv->ndev, "%s: 0x%p: should have been completed\n", + __func__, session); + goto out_session_cancel; + } + se_skb = j1939_session_skb_find(session); + if (!se_skb) { + netdev_warn(priv->ndev, "%s: 0x%p: no skb found\n", __func__, + session); + goto out_session_cancel; + } + + skcb = j1939_skb_to_cb(se_skb); + offset = packet * 7 - skcb->offset; + nbytes = se_skb->len - offset; + if (nbytes > 7) + nbytes = 7; + if (nbytes <= 0 || (nbytes + 1) > skb->len) { + netdev_info(priv->ndev, "%s: 0x%p: nbytes %i, len %i\n", + __func__, session, nbytes, skb->len); + goto out_session_cancel; + } + + tpdat = se_skb->data; + memcpy(&tpdat[offset], &dat[1], nbytes); + if (packet == session->pkt.rx) + session->pkt.rx++; + + if (skcb->addr.type != J1939_ETP && + j1939_cb_is_broadcast(&session->skcb)) { + if (session->pkt.rx >= session->pkt.total) + final = true; + } else { + /* never final, an EOMA must follow */ + if (session->pkt.rx >= session->pkt.last) + do_cts_eoma = true; + } + + if (final) { + j1939_session_completed(session); + } else if (do_cts_eoma) { + j1939_tp_set_rxtimeout(session, 1250); + if (!session->transmission) + j1939_tp_schedule_txtimer(session, 0); + } else { + j1939_tp_set_rxtimeout(session, 250); + } + session->last_cmd = 0xff; + j1939_session_put(session); + + return; + + out_session_cancel: + j1939_session_timers_cancel(session); + j1939_session_cancel(session, J1939_XTP_ABORT_FAULT); + j1939_session_put(session); +} + +static void j1939_xtp_rx_dat(struct j1939_priv *priv, struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb; + struct j1939_session *session; + + skcb = j1939_skb_to_cb(skb); + + if (j1939_tp_im_transmitter(skcb)) { + session = j1939_session_get_by_addr(priv, &skcb->addr, false, + true); + if (!session) + netdev_info(priv->ndev, "%s: no tx connection found\n", + __func__); + else + j1939_xtp_rx_dat_one(session, skb); + } + + if (j1939_tp_im_receiver(skcb)) { + session = j1939_session_get_by_addr(priv, &skcb->addr, false, + false); + if (!session) + netdev_info(priv->ndev, "%s: no rx connection found\n", + __func__); + else + j1939_xtp_rx_dat_one(session, skb); + } +} + +/* j1939 main intf */ +struct j1939_session *j1939_tp_send(struct j1939_priv *priv, + struct sk_buff *skb, size_t size) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_session *session; + int ret; + + if (skcb->addr.pgn == J1939_TP_PGN_DAT || + skcb->addr.pgn == J1939_TP_PGN_CTL || + skcb->addr.pgn == J1939_ETP_PGN_DAT || + skcb->addr.pgn == J1939_ETP_PGN_CTL) + /* avoid conflict */ + return ERR_PTR(-EDOM); + + if (size > priv->tp_max_packet_size) + return ERR_PTR(-EMSGSIZE); + + if (size <= 8) + skcb->addr.type = J1939_SIMPLE; + else if (size > J1939_MAX_TP_PACKET_SIZE) + skcb->addr.type = J1939_ETP; + else + skcb->addr.type = J1939_TP; + + if (skcb->addr.type == J1939_ETP && + j1939_cb_is_broadcast(skcb)) + return ERR_PTR(-EDESTADDRREQ); + + /* fill in addresses from names */ + ret = j1939_ac_fixup(priv, skb); + if (unlikely(ret)) + return ERR_PTR(ret); + + /* fix DST flags, it may be used there soon */ + if (j1939_address_is_unicast(skcb->addr.da) && + priv->ents[skcb->addr.da].nusers) + skcb->flags |= J1939_ECU_LOCAL_DST; + + /* src is always local, I'm sending ... */ + skcb->flags |= J1939_ECU_LOCAL_SRC; + + /* prepare new session */ + session = j1939_session_new(priv, skb, size); + if (!session) + return ERR_PTR(-ENOMEM); + + /* skb is recounted in j1939_session_new() */ + sock_hold(skb->sk); + session->sk = skb->sk; + session->transmission = true; + session->pkt.total = (size + 6) / 7; + session->pkt.block = skcb->addr.type == J1939_ETP ? 255 : + min(j1939_tp_block ?: 255, session->pkt.total); + + if (j1939_cb_is_broadcast(&session->skcb)) + /* set the end-packet for broadcast */ + session->pkt.last = session->pkt.total; + + skcb->tskey = session->sk->sk_tskey++; + session->tskey = skcb->tskey; + + return session; +} + +static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + int extd = J1939_TP; + u8 cmd = skb->data[0]; + + switch (cmd) { + case J1939_ETP_CMD_RTS: + extd = J1939_ETP; + /* fall through */ + case J1939_TP_CMD_BAM: /* fall through */ + case J1939_TP_CMD_RTS: /* fall through */ + if (skcb->addr.type != extd) + return; + + if (cmd == J1939_TP_CMD_RTS && j1939_cb_is_broadcast(skcb)) { + netdev_alert(priv->ndev, "%s: rts without destination (%02x)\n", + __func__, skcb->addr.sa); + return; + } + + if (j1939_tp_im_transmitter(skcb)) + j1939_xtp_rx_rts(priv, skb, true); + + if (j1939_tp_im_receiver(skcb)) + j1939_xtp_rx_rts(priv, skb, false); + + break; + + case J1939_ETP_CMD_CTS: + extd = J1939_ETP; + /* fall through */ + case J1939_TP_CMD_CTS: + if (skcb->addr.type != extd) + return; + + if (j1939_tp_im_transmitter(skcb)) + j1939_xtp_rx_cts(priv, skb, false); + + if (j1939_tp_im_receiver(skcb)) + j1939_xtp_rx_cts(priv, skb, true); + + break; + + case J1939_ETP_CMD_DPO: + if (skcb->addr.type != J1939_ETP) + return; + + if (j1939_tp_im_transmitter(skcb)) + j1939_xtp_rx_dpo(priv, skb, true); + + if (j1939_tp_im_receiver(skcb)) + j1939_xtp_rx_dpo(priv, skb, false); + + break; + + case J1939_ETP_CMD_EOMA: + extd = J1939_ETP; + /* fall through */ + case J1939_TP_CMD_EOMA: + if (skcb->addr.type != extd) + return; + + if (j1939_tp_im_transmitter(skcb)) + j1939_xtp_rx_eoma(priv, skb, false); + + if (j1939_tp_im_receiver(skcb)) + j1939_xtp_rx_eoma(priv, skb, true); + + break; + + case J1939_ETP_CMD_ABORT: /* && J1939_TP_CMD_ABORT */ + if (j1939_tp_im_transmitter(skcb)) + j1939_xtp_rx_abort(priv, skb, true); + + if (j1939_tp_im_receiver(skcb)) + j1939_xtp_rx_abort(priv, skb, false); + + break; + default: + return; + } +} + +int j1939_tp_recv(struct j1939_priv *priv, struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + + if (!j1939_tp_im_involved_anydir(skcb)) + return 0; + + switch (skcb->addr.pgn) { + case J1939_ETP_PGN_DAT: + skcb->addr.type = J1939_ETP; + /* fall through */ + case J1939_TP_PGN_DAT: + j1939_xtp_rx_dat(priv, skb); + break; + + case J1939_ETP_PGN_CTL: + skcb->addr.type = J1939_ETP; + /* fall through */ + case J1939_TP_PGN_CTL: + if (skb->len < 8) + return 0; /* Don't care. Nothing to extract here */ + + j1939_tp_cmd_recv(priv, skb); + break; + default: + return 0; /* no problem */ + } + return 1; /* "I processed the message" */ +} + +void j1939_simple_recv(struct j1939_priv *priv, struct sk_buff *skb) +{ + struct j1939_session *session; + + if (!skb->sk) + return; + + j1939_session_list_lock(priv); + session = j1939_session_get_simple(priv, skb); + j1939_session_list_unlock(priv); + if (!session) { + netdev_warn(priv->ndev, + "%s: Received already invalidated message\n", + __func__); + return; + } + + j1939_session_timers_cancel(session); + j1939_session_deactivate(session); + j1939_session_put(session); +} + +int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk) +{ + struct j1939_session *session, *saved; + + netdev_dbg(priv->ndev, "%s, sk: %p\n", __func__, sk); + j1939_session_list_lock(priv); + list_for_each_entry_safe(session, saved, + &priv->active_session_list, + active_session_list_entry) { + if (!sk || sk == session->sk) { + if (hrtimer_try_to_cancel(&session->txtimer) == 1) + j1939_session_put(session); + if (hrtimer_try_to_cancel(&session->rxtimer) == 1) + j1939_session_put(session); + + session->err = ESHUTDOWN; + j1939_session_deactivate_locked(session); + } + } + j1939_session_list_unlock(priv); + return NOTIFY_DONE; +} + +void j1939_tp_init(struct j1939_priv *priv) +{ + spin_lock_init(&priv->active_session_list_lock); + INIT_LIST_HEAD(&priv->active_session_list); + priv->tp_max_packet_size = J1939_MAX_ETP_PACKET_SIZE; +} diff --git a/net/can/proc.c b/net/can/proc.c index 70fea17bb04c..e6881bfc3ed1 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) /* * proc.c - procfs support for Protocol family CAN core module * @@ -44,6 +45,7 @@ #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/if_arp.h> +#include <linux/can/can-ml.h> #include <linux/can/core.h> #include "af_can.h" @@ -77,21 +79,21 @@ static const char rx_list_name[][8] = { static void can_init_stats(struct net *net) { - struct s_stats *can_stats = net->can.can_stats; - struct s_pstats *can_pstats = net->can.can_pstats; + struct can_pkg_stats *pkg_stats = net->can.pkg_stats; + struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; /* * This memset function is called from a timer context (when * can_stattimer is active which is the default) OR in a process * context (reading the proc_fs when can_stattimer is disabled). */ - memset(can_stats, 0, sizeof(struct s_stats)); - can_stats->jiffies_init = jiffies; + memset(pkg_stats, 0, sizeof(struct can_pkg_stats)); + pkg_stats->jiffies_init = jiffies; - can_pstats->stats_reset++; + rcv_lists_stats->stats_reset++; if (user_reset) { user_reset = 0; - can_pstats->user_reset++; + rcv_lists_stats->user_reset++; } } @@ -117,8 +119,8 @@ static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif, void can_stat_update(struct timer_list *t) { - struct net *net = from_timer(net, t, can.can_stattimer); - struct s_stats *can_stats = net->can.can_stats; + struct net *net = from_timer(net, t, can.stattimer); + struct can_pkg_stats *pkg_stats = net->can.pkg_stats; unsigned long j = jiffies; /* snapshot */ /* restart counting in timer context on user request */ @@ -126,57 +128,57 @@ void can_stat_update(struct timer_list *t) can_init_stats(net); /* restart counting on jiffies overflow */ - if (j < can_stats->jiffies_init) + if (j < pkg_stats->jiffies_init) can_init_stats(net); /* prevent overflow in calc_rate() */ - if (can_stats->rx_frames > (ULONG_MAX / HZ)) + if (pkg_stats->rx_frames > (ULONG_MAX / HZ)) can_init_stats(net); /* prevent overflow in calc_rate() */ - if (can_stats->tx_frames > (ULONG_MAX / HZ)) + if (pkg_stats->tx_frames > (ULONG_MAX / HZ)) can_init_stats(net); /* matches overflow - very improbable */ - if (can_stats->matches > (ULONG_MAX / 100)) + if (pkg_stats->matches > (ULONG_MAX / 100)) can_init_stats(net); /* calc total values */ - if (can_stats->rx_frames) - can_stats->total_rx_match_ratio = (can_stats->matches * 100) / - can_stats->rx_frames; + if (pkg_stats->rx_frames) + pkg_stats->total_rx_match_ratio = (pkg_stats->matches * 100) / + pkg_stats->rx_frames; - can_stats->total_tx_rate = calc_rate(can_stats->jiffies_init, j, - can_stats->tx_frames); - can_stats->total_rx_rate = calc_rate(can_stats->jiffies_init, j, - can_stats->rx_frames); + pkg_stats->total_tx_rate = calc_rate(pkg_stats->jiffies_init, j, + pkg_stats->tx_frames); + pkg_stats->total_rx_rate = calc_rate(pkg_stats->jiffies_init, j, + pkg_stats->rx_frames); /* calc current values */ - if (can_stats->rx_frames_delta) - can_stats->current_rx_match_ratio = - (can_stats->matches_delta * 100) / - can_stats->rx_frames_delta; + if (pkg_stats->rx_frames_delta) + pkg_stats->current_rx_match_ratio = + (pkg_stats->matches_delta * 100) / + pkg_stats->rx_frames_delta; - can_stats->current_tx_rate = calc_rate(0, HZ, can_stats->tx_frames_delta); - can_stats->current_rx_rate = calc_rate(0, HZ, can_stats->rx_frames_delta); + pkg_stats->current_tx_rate = calc_rate(0, HZ, pkg_stats->tx_frames_delta); + pkg_stats->current_rx_rate = calc_rate(0, HZ, pkg_stats->rx_frames_delta); /* check / update maximum values */ - if (can_stats->max_tx_rate < can_stats->current_tx_rate) - can_stats->max_tx_rate = can_stats->current_tx_rate; + if (pkg_stats->max_tx_rate < pkg_stats->current_tx_rate) + pkg_stats->max_tx_rate = pkg_stats->current_tx_rate; - if (can_stats->max_rx_rate < can_stats->current_rx_rate) - can_stats->max_rx_rate = can_stats->current_rx_rate; + if (pkg_stats->max_rx_rate < pkg_stats->current_rx_rate) + pkg_stats->max_rx_rate = pkg_stats->current_rx_rate; - if (can_stats->max_rx_match_ratio < can_stats->current_rx_match_ratio) - can_stats->max_rx_match_ratio = can_stats->current_rx_match_ratio; + if (pkg_stats->max_rx_match_ratio < pkg_stats->current_rx_match_ratio) + pkg_stats->max_rx_match_ratio = pkg_stats->current_rx_match_ratio; /* clear values for 'current rate' calculation */ - can_stats->tx_frames_delta = 0; - can_stats->rx_frames_delta = 0; - can_stats->matches_delta = 0; + pkg_stats->tx_frames_delta = 0; + pkg_stats->rx_frames_delta = 0; + pkg_stats->matches_delta = 0; /* restart timer (one second) */ - mod_timer(&net->can.can_stattimer, round_jiffies(jiffies + HZ)); + mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ)); } /* @@ -211,60 +213,60 @@ static void can_print_recv_banner(struct seq_file *m) static int can_stats_proc_show(struct seq_file *m, void *v) { struct net *net = m->private; - struct s_stats *can_stats = net->can.can_stats; - struct s_pstats *can_pstats = net->can.can_pstats; + struct can_pkg_stats *pkg_stats = net->can.pkg_stats; + struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; seq_putc(m, '\n'); - seq_printf(m, " %8ld transmitted frames (TXF)\n", can_stats->tx_frames); - seq_printf(m, " %8ld received frames (RXF)\n", can_stats->rx_frames); - seq_printf(m, " %8ld matched frames (RXMF)\n", can_stats->matches); + seq_printf(m, " %8ld transmitted frames (TXF)\n", pkg_stats->tx_frames); + seq_printf(m, " %8ld received frames (RXF)\n", pkg_stats->rx_frames); + seq_printf(m, " %8ld matched frames (RXMF)\n", pkg_stats->matches); seq_putc(m, '\n'); - if (net->can.can_stattimer.function == can_stat_update) { + if (net->can.stattimer.function == can_stat_update) { seq_printf(m, " %8ld %% total match ratio (RXMR)\n", - can_stats->total_rx_match_ratio); + pkg_stats->total_rx_match_ratio); seq_printf(m, " %8ld frames/s total tx rate (TXR)\n", - can_stats->total_tx_rate); + pkg_stats->total_tx_rate); seq_printf(m, " %8ld frames/s total rx rate (RXR)\n", - can_stats->total_rx_rate); + pkg_stats->total_rx_rate); seq_putc(m, '\n'); seq_printf(m, " %8ld %% current match ratio (CRXMR)\n", - can_stats->current_rx_match_ratio); + pkg_stats->current_rx_match_ratio); seq_printf(m, " %8ld frames/s current tx rate (CTXR)\n", - can_stats->current_tx_rate); + pkg_stats->current_tx_rate); seq_printf(m, " %8ld frames/s current rx rate (CRXR)\n", - can_stats->current_rx_rate); + pkg_stats->current_rx_rate); seq_putc(m, '\n'); seq_printf(m, " %8ld %% max match ratio (MRXMR)\n", - can_stats->max_rx_match_ratio); + pkg_stats->max_rx_match_ratio); seq_printf(m, " %8ld frames/s max tx rate (MTXR)\n", - can_stats->max_tx_rate); + pkg_stats->max_tx_rate); seq_printf(m, " %8ld frames/s max rx rate (MRXR)\n", - can_stats->max_rx_rate); + pkg_stats->max_rx_rate); seq_putc(m, '\n'); } seq_printf(m, " %8ld current receive list entries (CRCV)\n", - can_pstats->rcv_entries); + rcv_lists_stats->rcv_entries); seq_printf(m, " %8ld maximum receive list entries (MRCV)\n", - can_pstats->rcv_entries_max); + rcv_lists_stats->rcv_entries_max); - if (can_pstats->stats_reset) + if (rcv_lists_stats->stats_reset) seq_printf(m, "\n %8ld statistic resets (STR)\n", - can_pstats->stats_reset); + rcv_lists_stats->stats_reset); - if (can_pstats->user_reset) + if (rcv_lists_stats->user_reset) seq_printf(m, " %8ld user statistic resets (USTR)\n", - can_pstats->user_reset); + rcv_lists_stats->user_reset); seq_putc(m, '\n'); return 0; @@ -273,20 +275,20 @@ static int can_stats_proc_show(struct seq_file *m, void *v) static int can_reset_stats_proc_show(struct seq_file *m, void *v) { struct net *net = m->private; - struct s_pstats *can_pstats = net->can.can_pstats; - struct s_stats *can_stats = net->can.can_stats; + struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; + struct can_pkg_stats *pkg_stats = net->can.pkg_stats; user_reset = 1; - if (net->can.can_stattimer.function == can_stat_update) { + if (net->can.stattimer.function == can_stat_update) { seq_printf(m, "Scheduled statistic reset #%ld.\n", - can_pstats->stats_reset + 1); + rcv_lists_stats->stats_reset + 1); } else { - if (can_stats->jiffies_init != jiffies) + if (pkg_stats->jiffies_init != jiffies) can_init_stats(net); seq_printf(m, "Performed statistic reset #%ld.\n", - can_pstats->stats_reset); + rcv_lists_stats->stats_reset); } return 0; } @@ -299,11 +301,11 @@ static int can_version_proc_show(struct seq_file *m, void *v) static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx, struct net_device *dev, - struct can_dev_rcv_lists *d) + struct can_dev_rcv_lists *dev_rcv_lists) { - if (!hlist_empty(&d->rx[idx])) { + if (!hlist_empty(&dev_rcv_lists->rx[idx])) { can_print_recv_banner(m); - can_print_rcvlist(m, &d->rx[idx], dev); + can_print_rcvlist(m, &dev_rcv_lists->rx[idx], dev); } else seq_printf(m, " (%s: no entry)\n", DNAME(dev)); @@ -314,7 +316,7 @@ static int can_rcvlist_proc_show(struct seq_file *m, void *v) /* double cast to prevent GCC warning */ int idx = (int)(long)PDE_DATA(m->file->f_inode); struct net_device *dev; - struct can_dev_rcv_lists *d; + struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = m->private; seq_printf(m, "\nreceive list '%s':\n", rx_list_name[idx]); @@ -322,8 +324,8 @@ static int can_rcvlist_proc_show(struct seq_file *m, void *v) rcu_read_lock(); /* receive list for 'all' CAN devices (dev == NULL) */ - d = net->can.can_rx_alldev_list; - can_rcvlist_proc_show_one(m, idx, NULL, d); + dev_rcv_lists = net->can.rx_alldev_list; + can_rcvlist_proc_show_one(m, idx, NULL, dev_rcv_lists); /* receive list for registered CAN devices */ for_each_netdev_rcu(net, dev) { @@ -365,7 +367,7 @@ static inline void can_rcvlist_proc_show_array(struct seq_file *m, static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v) { struct net_device *dev; - struct can_dev_rcv_lists *d; + struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = m->private; /* RX_SFF */ @@ -374,15 +376,16 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v) rcu_read_lock(); /* sff receive list for 'all' CAN devices (dev == NULL) */ - d = net->can.can_rx_alldev_list; - can_rcvlist_proc_show_array(m, NULL, d->rx_sff, ARRAY_SIZE(d->rx_sff)); + dev_rcv_lists = net->can.rx_alldev_list; + can_rcvlist_proc_show_array(m, NULL, dev_rcv_lists->rx_sff, + ARRAY_SIZE(dev_rcv_lists->rx_sff)); /* sff receive list for registered CAN devices */ for_each_netdev_rcu(net, dev) { if (dev->type == ARPHRD_CAN && dev->ml_priv) { - d = dev->ml_priv; - can_rcvlist_proc_show_array(m, dev, d->rx_sff, - ARRAY_SIZE(d->rx_sff)); + dev_rcv_lists = dev->ml_priv; + can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_sff, + ARRAY_SIZE(dev_rcv_lists->rx_sff)); } } @@ -395,7 +398,7 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v) static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v) { struct net_device *dev; - struct can_dev_rcv_lists *d; + struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = m->private; /* RX_EFF */ @@ -404,15 +407,16 @@ static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v) rcu_read_lock(); /* eff receive list for 'all' CAN devices (dev == NULL) */ - d = net->can.can_rx_alldev_list; - can_rcvlist_proc_show_array(m, NULL, d->rx_eff, ARRAY_SIZE(d->rx_eff)); + dev_rcv_lists = net->can.rx_alldev_list; + can_rcvlist_proc_show_array(m, NULL, dev_rcv_lists->rx_eff, + ARRAY_SIZE(dev_rcv_lists->rx_eff)); /* eff receive list for registered CAN devices */ for_each_netdev_rcu(net, dev) { if (dev->type == ARPHRD_CAN && dev->ml_priv) { - d = dev->ml_priv; - can_rcvlist_proc_show_array(m, dev, d->rx_eff, - ARRAY_SIZE(d->rx_eff)); + dev_rcv_lists = dev->ml_priv; + can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_eff, + ARRAY_SIZE(dev_rcv_lists->rx_eff)); } } diff --git a/net/can/raw.c b/net/can/raw.c index afcbff063a67..59c039d73c6d 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -1,5 +1,5 @@ -/* - * raw.c - Raw sockets for protocol family CAN +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* raw.c - Raw sockets for protocol family CAN * * Copyright (c) 2002-2007 Volkswagen Group Electronic Research * All rights reserved. @@ -64,8 +64,7 @@ MODULE_ALIAS("can-proto-1"); #define MASK_ALL 0 -/* - * A raw socket has a list of can_filters attached to it, each receiving +/* A raw socket has a list of can_filters attached to it, each receiving * the CAN frames matching that filter. If the filter list is empty, * no CAN frames will be received by the socket. The default after * opening the socket, is to have one filter which receives all frames. @@ -96,8 +95,7 @@ struct raw_sock { struct uniqframe __percpu *uniq; }; -/* - * Return pointer to store the extra msg flags for raw_recvmsg(). +/* Return pointer to store the extra msg flags for raw_recvmsg(). * We use the space of one unsigned int beyond the 'struct sockaddr_can' * in skb->cb. */ @@ -156,8 +154,7 @@ static void raw_rcv(struct sk_buff *oskb, void *data) if (!skb) return; - /* - * Put the datagram to the queue so that raw_recvmsg() can + /* Put the datagram to the queue so that raw_recvmsg() can * get it from there. We need to pass the interface index to * raw_recvmsg(). We pass a whole struct sockaddr_can in skb->cb * containing the interface index. @@ -283,7 +280,6 @@ static int raw_notifier(struct notifier_block *nb, return NOTIFY_DONE; switch (msg) { - case NETDEV_UNREGISTER: lock_sock(sk); /* remove current filters & unregister */ @@ -369,8 +365,9 @@ static int raw_release(struct socket *sock) raw_disable_allfilters(dev_net(dev), dev, sk); dev_put(dev); } - } else + } else { raw_disable_allfilters(sock_net(sk), NULL, sk); + } } if (ro->count > 1) @@ -399,7 +396,7 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) int err = 0; int notify_enetdown = 0; - if (len < sizeof(*addr)) + if (len < CAN_REQUIRED_SIZE(*addr, can_ifindex)) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; @@ -450,8 +447,9 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) dev, sk); dev_put(dev); } - } else + } else { raw_disable_allfilters(sock_net(sk), NULL, sk); + } } ro->ifindex = ifindex; ro->bound = 1; @@ -502,7 +500,6 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, return -EINVAL; switch (optname) { - case CAN_RAW_FILTER: if (optlen % sizeof(struct can_filter) != 0) return -EINVAL; @@ -665,17 +662,18 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, return -EINVAL; switch (optname) { - case CAN_RAW_FILTER: lock_sock(sk); if (ro->count > 0) { int fsize = ro->count * sizeof(struct can_filter); + if (len > fsize) len = fsize; if (copy_to_user(optval, ro->filter, len)) err = -EFAULT; - } else + } else { len = 0; + } release_sock(sk); if (!err) @@ -735,15 +733,16 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name); - if (msg->msg_namelen < sizeof(*addr)) + if (msg->msg_namelen < CAN_REQUIRED_SIZE(*addr, can_ifindex)) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; ifindex = addr->can_ifindex; - } else + } else { ifindex = ro->ifindex; + } dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) @@ -836,6 +835,13 @@ static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, return size; } +static int raw_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + /* no ioctls for socket layer -> hand it down to NIC layer */ + return -ENOIOCTLCMD; +} + static const struct proto_ops raw_ops = { .family = PF_CAN, .release = raw_release, @@ -845,7 +851,7 @@ static const struct proto_ops raw_ops = { .accept = sock_no_accept, .getname = raw_getname, .poll = datagram_poll, - .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ + .ioctl = raw_sock_no_ioctlcmd, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -879,7 +885,7 @@ static __init int raw_module_init(void) err = can_proto_register(&raw_can_proto); if (err < 0) - printk(KERN_ERR "can: registration of raw protocol failed\n"); + pr_err("can: registration of raw protocol failed\n"); return err; } diff --git a/net/ceph/Makefile b/net/ceph/Makefile index 59d0ba2072de..ce09bb4fb249 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -13,5 +13,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ auth.o auth_none.o \ crypto.o armor.o \ auth_x.o \ - ceph_fs.o ceph_strings.o ceph_hash.o \ + ceph_strings.o ceph_hash.o \ pagevec.o snapshot.o string_table.o diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4eeea4d5c3ef..a0e97f6c1072 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -11,8 +11,9 @@ #include <linux/module.h> #include <linux/mount.h> #include <linux/nsproxy.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/statfs.h> @@ -185,18 +186,34 @@ int ceph_compare_options(struct ceph_options *new_opt, } EXPORT_SYMBOL(ceph_compare_options); +/* + * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are + * compatible with (a superset of) GFP_KERNEL. This is because while the + * actual pages are allocated with the specified flags, the page table pages + * are always allocated with GFP_KERNEL. map_vm_area() doesn't even take + * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc(). + * + * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO. + */ void *ceph_kvmalloc(size_t size, gfp_t flags) { - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *ptr = kmalloc(size, flags | __GFP_NOWARN); - if (ptr) - return ptr; + void *p; + + if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) { + p = kvmalloc(size, flags); + } else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) { + unsigned int nofs_flag = memalloc_nofs_save(); + p = kvmalloc(size, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + } else { + unsigned int noio_flag = memalloc_noio_save(); + p = kvmalloc(size, GFP_KERNEL); + memalloc_noio_restore(noio_flag); } - return __vmalloc(size, flags, PAGE_KERNEL); + return p; } - static int parse_fsid(const char *str, struct ceph_fsid *fsid) { int i = 0; @@ -237,58 +254,72 @@ enum { Opt_mount_timeout, Opt_osd_idle_ttl, Opt_osd_request_timeout, - Opt_last_int, /* int args above */ Opt_fsid, Opt_name, Opt_secret, Opt_key, Opt_ip, - Opt_last_string, /* string args above */ Opt_share, - Opt_noshare, Opt_crc, - Opt_nocrc, Opt_cephx_require_signatures, - Opt_nocephx_require_signatures, Opt_cephx_sign_messages, - Opt_nocephx_sign_messages, Opt_tcp_nodelay, - Opt_notcp_nodelay, Opt_abort_on_full, }; -static match_table_t opt_tokens = { - {Opt_osdtimeout, "osdtimeout=%d"}, - {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, - {Opt_mount_timeout, "mount_timeout=%d"}, - {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, - {Opt_osd_request_timeout, "osd_request_timeout=%d"}, - /* int args above */ - {Opt_fsid, "fsid=%s"}, - {Opt_name, "name=%s"}, - {Opt_secret, "secret=%s"}, - {Opt_key, "key=%s"}, - {Opt_ip, "ip=%s"}, - /* string args above */ - {Opt_share, "share"}, - {Opt_noshare, "noshare"}, - {Opt_crc, "crc"}, - {Opt_nocrc, "nocrc"}, - {Opt_cephx_require_signatures, "cephx_require_signatures"}, - {Opt_nocephx_require_signatures, "nocephx_require_signatures"}, - {Opt_cephx_sign_messages, "cephx_sign_messages"}, - {Opt_nocephx_sign_messages, "nocephx_sign_messages"}, - {Opt_tcp_nodelay, "tcp_nodelay"}, - {Opt_notcp_nodelay, "notcp_nodelay"}, - {Opt_abort_on_full, "abort_on_full"}, - {-1, NULL} +static const struct fs_parameter_spec ceph_parameters[] = { + fsparam_flag ("abort_on_full", Opt_abort_on_full), + fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures), + fsparam_flag_no ("cephx_sign_messages", Opt_cephx_sign_messages), + fsparam_flag_no ("crc", Opt_crc), + fsparam_string ("fsid", Opt_fsid), + fsparam_string ("ip", Opt_ip), + fsparam_string ("key", Opt_key), + fsparam_u32 ("mount_timeout", Opt_mount_timeout), + fsparam_string ("name", Opt_name), + fsparam_u32 ("osd_idle_ttl", Opt_osd_idle_ttl), + fsparam_u32 ("osd_request_timeout", Opt_osd_request_timeout), + fsparam_u32 ("osdkeepalive", Opt_osdkeepalivetimeout), + __fsparam (fs_param_is_s32, "osdtimeout", Opt_osdtimeout, + fs_param_deprecated, NULL), + fsparam_string ("secret", Opt_secret), + fsparam_flag_no ("share", Opt_share), + fsparam_flag_no ("tcp_nodelay", Opt_tcp_nodelay), + {} }; +struct ceph_options *ceph_alloc_options(void) +{ + struct ceph_options *opt; + + opt = kzalloc(sizeof(*opt), GFP_KERNEL); + if (!opt) + return NULL; + + opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), + GFP_KERNEL); + if (!opt->mon_addr) { + kfree(opt); + return NULL; + } + + opt->flags = CEPH_OPT_DEFAULT; + opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; + opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; + opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; + opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT; + return opt; +} +EXPORT_SYMBOL(ceph_alloc_options); + void ceph_destroy_options(struct ceph_options *opt) { dout("destroy_options %p\n", opt); + if (!opt) + return; + kfree(opt->name); if (opt->key) { ceph_crypto_key_destroy(opt->key); @@ -300,7 +331,9 @@ void ceph_destroy_options(struct ceph_options *opt) EXPORT_SYMBOL(ceph_destroy_options); /* get secret from key store */ -static int get_secret(struct ceph_crypto_key *dst, const char *name) { +static int get_secret(struct ceph_crypto_key *dst, const char *name, + struct p_log *log) +{ struct key *ukey; int key_err; int err = 0; @@ -313,20 +346,20 @@ static int get_secret(struct ceph_crypto_key *dst, const char *name) { key_err = PTR_ERR(ukey); switch (key_err) { case -ENOKEY: - pr_warn("ceph: Mount failed due to key not found: %s\n", - name); + error_plog(log, "Failed due to key not found: %s", + name); break; case -EKEYEXPIRED: - pr_warn("ceph: Mount failed due to expired key: %s\n", - name); + error_plog(log, "Failed due to expired key: %s", + name); break; case -EKEYREVOKED: - pr_warn("ceph: Mount failed due to revoked key: %s\n", - name); + error_plog(log, "Failed due to revoked key: %s", + name); break; default: - pr_warn("ceph: Mount failed due to unknown key error %d: %s\n", - key_err, name); + error_plog(log, "Failed due to key error %d: %s", + key_err, name); } err = -EPERM; goto out; @@ -344,217 +377,159 @@ out: return err; } -struct ceph_options * -ceph_parse_options(char *options, const char *dev_name, - const char *dev_name_end, - int (*parse_extra_token)(char *c, void *private), - void *private) +int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt, + struct fc_log *l) { - struct ceph_options *opt; - const char *c; - int err = -ENOMEM; - substring_t argstr[MAX_OPT_ARGS]; - - opt = kzalloc(sizeof(*opt), GFP_KERNEL); - if (!opt) - return ERR_PTR(-ENOMEM); - opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), - GFP_KERNEL); - if (!opt->mon_addr) - goto out; - - dout("parse_options %p options '%s' dev_name '%s'\n", opt, options, - dev_name); - - /* start with defaults */ - opt->flags = CEPH_OPT_DEFAULT; - opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; - opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; - opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; - opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT; + struct p_log log = {.prefix = "libceph", .log = l}; + int ret; - /* get mon ip(s) */ /* ip1[:port1][,ip2[:port2]...] */ - err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr, - CEPH_MAX_MON, &opt->num_mon); - if (err < 0) - goto out; + ret = ceph_parse_ips(buf, buf + len, opt->mon_addr, CEPH_MAX_MON, + &opt->num_mon); + if (ret) { + error_plog(&log, "Failed to parse monitor IPs: %d", ret); + return ret; + } - /* parse mount options */ - while ((c = strsep(&options, ",")) != NULL) { - int token, intval; - if (!*c) - continue; - err = -EINVAL; - token = match_token((char *)c, opt_tokens, argstr); - if (token < 0 && parse_extra_token) { - /* extra? */ - err = parse_extra_token((char *)c, private); - if (err < 0) { - pr_err("bad option at '%s'\n", c); - goto out; - } - continue; - } - if (token < Opt_last_int) { - err = match_int(&argstr[0], &intval); - if (err < 0) { - pr_err("bad option arg (not int) at '%s'\n", c); - goto out; - } - dout("got int token %d val %d\n", token, intval); - } else if (token > Opt_last_int && token < Opt_last_string) { - dout("got string token %d val %s\n", token, - argstr[0].from); - } else { - dout("got token %d\n", token); + return 0; +} +EXPORT_SYMBOL(ceph_parse_mon_ips); + +int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, + struct fc_log *l) +{ + struct fs_parse_result result; + int token, err; + struct p_log log = {.prefix = "libceph", .log = l}; + + token = __fs_parse(&log, ceph_parameters, param, &result); + dout("%s fs_parse '%s' token %d\n", __func__, param->key, token); + if (token < 0) + return token; + + switch (token) { + case Opt_ip: + err = ceph_parse_ips(param->string, + param->string + param->size, + &opt->my_addr, + 1, NULL); + if (err) { + error_plog(&log, "Failed to parse ip: %d", err); + return err; } - switch (token) { - case Opt_ip: - err = ceph_parse_ips(argstr[0].from, - argstr[0].to, - &opt->my_addr, - 1, NULL); - if (err < 0) - goto out; - opt->flags |= CEPH_OPT_MYIP; - break; + opt->flags |= CEPH_OPT_MYIP; + break; - case Opt_fsid: - err = parse_fsid(argstr[0].from, &opt->fsid); - if (err == 0) - opt->flags |= CEPH_OPT_FSID; - break; - case Opt_name: - kfree(opt->name); - opt->name = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - if (!opt->name) { - err = -ENOMEM; - goto out; - } - break; - case Opt_secret: - ceph_crypto_key_destroy(opt->key); - kfree(opt->key); - - opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); - if (!opt->key) { - err = -ENOMEM; - goto out; - } - err = ceph_crypto_key_unarmor(opt->key, argstr[0].from); - if (err < 0) - goto out; - break; - case Opt_key: - ceph_crypto_key_destroy(opt->key); - kfree(opt->key); - - opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); - if (!opt->key) { - err = -ENOMEM; - goto out; - } - err = get_secret(opt->key, argstr[0].from); - if (err < 0) - goto out; - break; + case Opt_fsid: + err = parse_fsid(param->string, &opt->fsid); + if (err) { + error_plog(&log, "Failed to parse fsid: %d", err); + return err; + } + opt->flags |= CEPH_OPT_FSID; + break; + case Opt_name: + kfree(opt->name); + opt->name = param->string; + param->string = NULL; + break; + case Opt_secret: + ceph_crypto_key_destroy(opt->key); + kfree(opt->key); - /* misc */ - case Opt_osdtimeout: - pr_warn("ignoring deprecated osdtimeout option\n"); - break; - case Opt_osdkeepalivetimeout: - /* 0 isn't well defined right now, reject it */ - if (intval < 1 || intval > INT_MAX / 1000) { - pr_err("osdkeepalive out of range\n"); - err = -EINVAL; - goto out; - } - opt->osd_keepalive_timeout = - msecs_to_jiffies(intval * 1000); - break; - case Opt_osd_idle_ttl: - /* 0 isn't well defined right now, reject it */ - if (intval < 1 || intval > INT_MAX / 1000) { - pr_err("osd_idle_ttl out of range\n"); - err = -EINVAL; - goto out; - } - opt->osd_idle_ttl = msecs_to_jiffies(intval * 1000); - break; - case Opt_mount_timeout: - /* 0 is "wait forever" (i.e. infinite timeout) */ - if (intval < 0 || intval > INT_MAX / 1000) { - pr_err("mount_timeout out of range\n"); - err = -EINVAL; - goto out; - } - opt->mount_timeout = msecs_to_jiffies(intval * 1000); - break; - case Opt_osd_request_timeout: - /* 0 is "wait forever" (i.e. infinite timeout) */ - if (intval < 0 || intval > INT_MAX / 1000) { - pr_err("osd_request_timeout out of range\n"); - err = -EINVAL; - goto out; - } - opt->osd_request_timeout = msecs_to_jiffies(intval * 1000); - break; + opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); + if (!opt->key) + return -ENOMEM; + err = ceph_crypto_key_unarmor(opt->key, param->string); + if (err) { + error_plog(&log, "Failed to parse secret: %d", err); + return err; + } + break; + case Opt_key: + ceph_crypto_key_destroy(opt->key); + kfree(opt->key); - case Opt_share: + opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); + if (!opt->key) + return -ENOMEM; + return get_secret(opt->key, param->string, &log); + + case Opt_osdtimeout: + warn_plog(&log, "Ignoring osdtimeout"); + break; + case Opt_osdkeepalivetimeout: + /* 0 isn't well defined right now, reject it */ + if (result.uint_32 < 1 || result.uint_32 > INT_MAX / 1000) + goto out_of_range; + opt->osd_keepalive_timeout = + msecs_to_jiffies(result.uint_32 * 1000); + break; + case Opt_osd_idle_ttl: + /* 0 isn't well defined right now, reject it */ + if (result.uint_32 < 1 || result.uint_32 > INT_MAX / 1000) + goto out_of_range; + opt->osd_idle_ttl = msecs_to_jiffies(result.uint_32 * 1000); + break; + case Opt_mount_timeout: + /* 0 is "wait forever" (i.e. infinite timeout) */ + if (result.uint_32 > INT_MAX / 1000) + goto out_of_range; + opt->mount_timeout = msecs_to_jiffies(result.uint_32 * 1000); + break; + case Opt_osd_request_timeout: + /* 0 is "wait forever" (i.e. infinite timeout) */ + if (result.uint_32 > INT_MAX / 1000) + goto out_of_range; + opt->osd_request_timeout = + msecs_to_jiffies(result.uint_32 * 1000); + break; + + case Opt_share: + if (!result.negated) opt->flags &= ~CEPH_OPT_NOSHARE; - break; - case Opt_noshare: + else opt->flags |= CEPH_OPT_NOSHARE; - break; - - case Opt_crc: + break; + case Opt_crc: + if (!result.negated) opt->flags &= ~CEPH_OPT_NOCRC; - break; - case Opt_nocrc: + else opt->flags |= CEPH_OPT_NOCRC; - break; - - case Opt_cephx_require_signatures: + break; + case Opt_cephx_require_signatures: + if (!result.negated) opt->flags &= ~CEPH_OPT_NOMSGAUTH; - break; - case Opt_nocephx_require_signatures: + else opt->flags |= CEPH_OPT_NOMSGAUTH; - break; - case Opt_cephx_sign_messages: + break; + case Opt_cephx_sign_messages: + if (!result.negated) opt->flags &= ~CEPH_OPT_NOMSGSIGN; - break; - case Opt_nocephx_sign_messages: + else opt->flags |= CEPH_OPT_NOMSGSIGN; - break; - - case Opt_tcp_nodelay: + break; + case Opt_tcp_nodelay: + if (!result.negated) opt->flags |= CEPH_OPT_TCP_NODELAY; - break; - case Opt_notcp_nodelay: + else opt->flags &= ~CEPH_OPT_TCP_NODELAY; - break; + break; - case Opt_abort_on_full: - opt->flags |= CEPH_OPT_ABORT_ON_FULL; - break; + case Opt_abort_on_full: + opt->flags |= CEPH_OPT_ABORT_ON_FULL; + break; - default: - BUG_ON(token); - } + default: + BUG(); } - /* success */ - return opt; + return 0; -out: - ceph_destroy_options(opt); - return ERR_PTR(err); +out_of_range: + return inval_plog(&log, "%s out of range", param->key); } -EXPORT_SYMBOL(ceph_parse_options); +EXPORT_SYMBOL(ceph_parse_param); int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, bool show_all) @@ -694,6 +669,14 @@ void ceph_destroy_client(struct ceph_client *client) } EXPORT_SYMBOL(ceph_destroy_client); +void ceph_reset_client_addr(struct ceph_client *client) +{ + ceph_messenger_reset_nonce(&client->msgr); + ceph_monc_reopen_session(&client->monc); + ceph_osdc_reopen_osds(&client->osdc); +} +EXPORT_SYMBOL(ceph_reset_client_addr); + /* * true if we have the mon map (and have thus joined the cluster) */ diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c deleted file mode 100644 index 756a2dc10d27..000000000000 --- a/net/ceph/ceph_fs.c +++ /dev/null @@ -1,104 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Some non-inline ceph helpers - */ -#include <linux/module.h> -#include <linux/ceph/types.h> - -/* - * return true if @layout appears to be valid - */ -int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) -{ - __u32 su = layout->stripe_unit; - __u32 sc = layout->stripe_count; - __u32 os = layout->object_size; - - /* stripe unit, object size must be non-zero, 64k increment */ - if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1))) - return 0; - if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1))) - return 0; - /* object size must be a multiple of stripe unit */ - if (os < su || os % su) - return 0; - /* stripe count must be non-zero */ - if (!sc) - return 0; - return 1; -} - -void ceph_file_layout_from_legacy(struct ceph_file_layout *fl, - struct ceph_file_layout_legacy *legacy) -{ - fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit); - fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count); - fl->object_size = le32_to_cpu(legacy->fl_object_size); - fl->pool_id = le32_to_cpu(legacy->fl_pg_pool); - if (fl->pool_id == 0 && fl->stripe_unit == 0 && - fl->stripe_count == 0 && fl->object_size == 0) - fl->pool_id = -1; -} -EXPORT_SYMBOL(ceph_file_layout_from_legacy); - -void ceph_file_layout_to_legacy(struct ceph_file_layout *fl, - struct ceph_file_layout_legacy *legacy) -{ - legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit); - legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count); - legacy->fl_object_size = cpu_to_le32(fl->object_size); - if (fl->pool_id >= 0) - legacy->fl_pg_pool = cpu_to_le32(fl->pool_id); - else - legacy->fl_pg_pool = 0; -} -EXPORT_SYMBOL(ceph_file_layout_to_legacy); - -int ceph_flags_to_mode(int flags) -{ - int mode; - -#ifdef O_DIRECTORY /* fixme */ - if ((flags & O_DIRECTORY) == O_DIRECTORY) - return CEPH_FILE_MODE_PIN; -#endif - - switch (flags & O_ACCMODE) { - case O_WRONLY: - mode = CEPH_FILE_MODE_WR; - break; - case O_RDONLY: - mode = CEPH_FILE_MODE_RD; - break; - case O_RDWR: - case O_ACCMODE: /* this is what the VFS does */ - mode = CEPH_FILE_MODE_RDWR; - break; - } -#ifdef O_LAZY - if (flags & O_LAZY) - mode |= CEPH_FILE_MODE_LAZY; -#endif - - return mode; -} -EXPORT_SYMBOL(ceph_flags_to_mode); - -int ceph_caps_for_mode(int mode) -{ - int caps = CEPH_CAP_PIN; - - if (mode & CEPH_FILE_MODE_RD) - caps |= CEPH_CAP_FILE_SHARED | - CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE; - if (mode & CEPH_FILE_MODE_WR) - caps |= CEPH_CAP_FILE_EXCL | - CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | - CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | - CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; - if (mode & CEPH_FILE_MODE_LAZY) - caps |= CEPH_CAP_FILE_LAZYIO; - - return caps; -} -EXPORT_SYMBOL(ceph_caps_for_mode); diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 5d6724cee38f..4f75df40fb12 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -136,8 +136,10 @@ void ceph_crypto_key_destroy(struct ceph_crypto_key *key) if (key) { kfree(key->key); key->key = NULL; - crypto_free_sync_skcipher(key->tfm); - key->tfm = NULL; + if (key->tfm) { + crypto_free_sync_skcipher(key->tfm); + key->tfm = NULL; + } } } diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 962f521c863e..5b4bd8261002 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2004,10 +2004,8 @@ int ceph_parse_ips(const char *c, const char *end, return 0; bad: - pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); return ret; } -EXPORT_SYMBOL(ceph_parse_ips); static int process_banner(struct ceph_connection *con) { @@ -3031,6 +3029,12 @@ static void con_fault(struct ceph_connection *con) } +void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) +{ + u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; + msgr->inst.addr.nonce = cpu_to_le32(nonce); + encode_my_addr(msgr); +} /* * initialize a new messenger instance diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 0520bf9825aa..9d9e4e4ea600 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -213,6 +213,13 @@ static void reopen_session(struct ceph_mon_client *monc) __open_session(monc); } +void ceph_monc_reopen_session(struct ceph_mon_client *monc) +{ + mutex_lock(&monc->mutex); + reopen_session(monc); + mutex_unlock(&monc->mutex); +} + static void un_backoff(struct ceph_mon_client *monc) { monc->hunt_mult /= 2; /* reduce by 50% */ @@ -1226,9 +1233,6 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) struct ceph_mon_client *monc = con->private; int type = le16_to_cpu(msg->hdr.type); - if (!monc) - return; - switch (type) { case CEPH_MSG_AUTH_REPLY: handle_auth_reply(monc, msg); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0b2df09b2554..b68b376d8c2f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -402,7 +402,7 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_LIST_WATCHERS: ceph_osd_data_release(&op->list_watchers.response_data); break; - case CEPH_OSD_OP_COPY_FROM: + case CEPH_OSD_OP_COPY_FROM2: ceph_osd_data_release(&op->copy_from.osd_data); break; default: @@ -697,7 +697,7 @@ static void get_num_data_items(struct ceph_osd_request *req, case CEPH_OSD_OP_SETXATTR: case CEPH_OSD_OP_CMPXATTR: case CEPH_OSD_OP_NOTIFY_ACK: - case CEPH_OSD_OP_COPY_FROM: + case CEPH_OSD_OP_COPY_FROM2: *num_request_data_items += 1; break; @@ -841,6 +841,7 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, struct ceph_pagelist *pagelist; size_t payload_len = 0; size_t size; + int ret; op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); @@ -852,20 +853,27 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, size = strlen(class); BUG_ON(size > (size_t) U8_MAX); op->cls.class_len = size; - ceph_pagelist_append(pagelist, class, size); + ret = ceph_pagelist_append(pagelist, class, size); + if (ret) + goto err_pagelist_free; payload_len += size; op->cls.method_name = method; size = strlen(method); BUG_ON(size > (size_t) U8_MAX); op->cls.method_len = size; - ceph_pagelist_append(pagelist, method, size); + ret = ceph_pagelist_append(pagelist, method, size); + if (ret) + goto err_pagelist_free; payload_len += size; osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); - op->indata_len = payload_len; return 0; + +err_pagelist_free: + ceph_pagelist_release(pagelist); + return ret; } EXPORT_SYMBOL(osd_req_op_cls_init); @@ -877,6 +885,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, opcode, 0); struct ceph_pagelist *pagelist; size_t payload_len; + int ret; BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); @@ -886,10 +895,14 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, payload_len = strlen(name); op->xattr.name_len = payload_len; - ceph_pagelist_append(pagelist, name, payload_len); + ret = ceph_pagelist_append(pagelist, name, payload_len); + if (ret) + goto err_pagelist_free; op->xattr.value_len = size; - ceph_pagelist_append(pagelist, value, size); + ret = ceph_pagelist_append(pagelist, value, size); + if (ret) + goto err_pagelist_free; payload_len += size; op->xattr.cmp_op = cmp_op; @@ -898,6 +911,10 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); op->indata_len = payload_len; return 0; + +err_pagelist_free: + ceph_pagelist_release(pagelist); + return ret; } EXPORT_SYMBOL(osd_req_op_xattr_init); @@ -1012,7 +1029,7 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, case CEPH_OSD_OP_CREATE: case CEPH_OSD_OP_DELETE: break; - case CEPH_OSD_OP_COPY_FROM: + case CEPH_OSD_OP_COPY_FROM2: dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid); dst->copy_from.src_version = cpu_to_le64(src->copy_from.src_version); @@ -1488,7 +1505,6 @@ enum calc_target_result { static enum calc_target_result calc_target(struct ceph_osd_client *osdc, struct ceph_osd_request_target *t, - struct ceph_connection *con, bool any_change) { struct ceph_pg_pool_info *pi; @@ -1496,7 +1512,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, struct ceph_osds up, acting; bool force_resend = false; bool unpaused = false; - bool legacy_change; + bool legacy_change = false; bool split = false; bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE); bool recovery_deletes = ceph_osdmap_flag(osdc, @@ -1584,15 +1600,14 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, t->osd = acting.primary; } - if (unpaused || legacy_change || force_resend || - (split && con && CEPH_HAVE_FEATURE(con->peer_features, - RESEND_ON_SPLIT))) + if (unpaused || legacy_change || force_resend || split) ct_res = CALC_TARGET_NEED_RESEND; else ct_res = CALC_TARGET_NO_ACTION; out: - dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd); + dout("%s t %p -> %d%d%d%d ct_res %d osd%d\n", __func__, t, unpaused, + legacy_change, force_resend, split, ct_res, t->osd); return ct_res; } @@ -1951,7 +1966,7 @@ static void setup_request_data(struct ceph_osd_request *req) ceph_osdc_msg_data_add(request_msg, &op->notify_ack.request_data); break; - case CEPH_OSD_OP_COPY_FROM: + case CEPH_OSD_OP_COPY_FROM2: ceph_osdc_msg_data_add(request_msg, &op->copy_from.osd_data); break; @@ -2273,7 +2288,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); again: - ct_res = calc_target(osdc, &req->r_t, NULL, false); + ct_res = calc_target(osdc, &req->r_t, false); if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) goto promote; @@ -2477,6 +2492,14 @@ void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err) } EXPORT_SYMBOL(ceph_osdc_abort_requests); +void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc) +{ + down_write(&osdc->lock); + osdc->abort_err = 0; + up_write(&osdc->lock); +} +EXPORT_SYMBOL(ceph_osdc_clear_abort_err); + static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) { if (likely(eb > osdc->epoch_barrier)) { @@ -3088,7 +3111,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id; } - calc_target(osdc, &lreq->t, NULL, false); + calc_target(osdc, &lreq->t, false); osd = lookup_create_osd(osdc, lreq->t.osd, true); link_linger(osd, lreq); @@ -3705,7 +3728,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq) struct ceph_osd_client *osdc = lreq->osdc; enum calc_target_result ct_res; - ct_res = calc_target(osdc, &lreq->t, NULL, true); + ct_res = calc_target(osdc, &lreq->t, true); if (ct_res == CALC_TARGET_NEED_RESEND) { struct ceph_osd *osd; @@ -3777,8 +3800,7 @@ static void scan_requests(struct ceph_osd *osd, n = rb_next(n); /* unlink_request(), check_pool_dne() */ dout("%s req %p tid %llu\n", __func__, req, req->r_tid); - ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con, - false); + ct_res = calc_target(osdc, &req->r_t, false); switch (ct_res) { case CALC_TARGET_NO_ACTION: force_resend_writes = cleared_full || @@ -3887,7 +3909,7 @@ static void kick_requests(struct ceph_osd_client *osdc, n = rb_next(n); if (req->r_t.epoch < osdc->osdmap->epoch) { - ct_res = calc_target(osdc, &req->r_t, NULL, false); + ct_res = calc_target(osdc, &req->r_t, false); if (ct_res == CALC_TARGET_POOL_DNE) { erase_request(need_resend, req); check_pool_dne(req); @@ -5088,6 +5110,24 @@ out_put_req: EXPORT_SYMBOL(ceph_osdc_call); /* + * reset all osd connections + */ +void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc) +{ + struct rb_node *n; + + down_write(&osdc->lock); + for (n = rb_first(&osdc->osds); n; ) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + n = rb_next(n); + if (!reopen_osd(osd)) + kick_osd_requests(osd); + } + up_write(&osdc->lock); +} + +/* * init, shutdown */ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) @@ -5275,6 +5315,7 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req, struct ceph_object_locator *src_oloc, u32 src_fadvise_flags, u32 dst_fadvise_flags, + u32 truncate_seq, u64 truncate_size, u8 copy_from_flags) { struct ceph_osd_req_op *op; @@ -5285,7 +5326,8 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req, if (IS_ERR(pages)) return PTR_ERR(pages); - op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM, dst_fadvise_flags); + op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM2, + dst_fadvise_flags); op->copy_from.snapid = src_snapid; op->copy_from.src_version = src_version; op->copy_from.flags = copy_from_flags; @@ -5295,6 +5337,8 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req, end = p + PAGE_SIZE; ceph_encode_string(&p, end, src_oid->name, src_oid->name_len); encode_oloc(&p, end, src_oloc); + ceph_encode_32(&p, truncate_seq); + ceph_encode_64(&p, truncate_size); op->indata_len = PAGE_SIZE - (end - p); ceph_osd_data_pages_init(&op->copy_from.osd_data, pages, @@ -5310,6 +5354,7 @@ int ceph_osdc_copy_from(struct ceph_osd_client *osdc, struct ceph_object_id *dst_oid, struct ceph_object_locator *dst_oloc, u32 dst_fadvise_flags, + u32 truncate_seq, u64 truncate_size, u8 copy_from_flags) { struct ceph_osd_request *req; @@ -5326,7 +5371,8 @@ int ceph_osdc_copy_from(struct ceph_osd_client *osdc, ret = osd_req_op_copy_from_init(req, src_snapid, src_version, src_oid, src_oloc, src_fadvise_flags, - dst_fadvise_flags, copy_from_flags); + dst_fadvise_flags, truncate_seq, + truncate_size, copy_from_flags); if (ret) goto out; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 90437906b7bc..4e0de14f80bb 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -973,11 +973,11 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) struct ceph_pg_pool_info, node); __remove_pg_pool(&map->pg_pools, pi); } - kfree(map->osd_state); - kfree(map->osd_weight); - kfree(map->osd_addr); - kfree(map->osd_primary_affinity); - kfree(map->crush_workspace); + kvfree(map->osd_state); + kvfree(map->osd_weight); + kvfree(map->osd_addr); + kvfree(map->osd_primary_affinity); + kvfree(map->crush_workspace); kfree(map); } @@ -986,28 +986,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) * * The new elements are properly initialized. */ -static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) +static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) { u32 *state; u32 *weight; struct ceph_entity_addr *addr; + u32 to_copy; int i; - state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); - if (!state) - return -ENOMEM; - map->osd_state = state; + dout("%s old %u new %u\n", __func__, map->max_osd, max); + if (max == map->max_osd) + return 0; - weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); - if (!weight) + state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); + weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); + addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); + if (!state || !weight || !addr) { + kvfree(state); + kvfree(weight); + kvfree(addr); return -ENOMEM; - map->osd_weight = weight; + } - addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); - if (!addr) - return -ENOMEM; - map->osd_addr = addr; + to_copy = min(map->max_osd, max); + if (map->osd_state) { + memcpy(state, map->osd_state, to_copy * sizeof(*state)); + memcpy(weight, map->osd_weight, to_copy * sizeof(*weight)); + memcpy(addr, map->osd_addr, to_copy * sizeof(*addr)); + kvfree(map->osd_state); + kvfree(map->osd_weight); + kvfree(map->osd_addr); + } + map->osd_state = state; + map->osd_weight = weight; + map->osd_addr = addr; for (i = map->max_osd; i < max; i++) { map->osd_state[i] = 0; map->osd_weight[i] = CEPH_OSD_OUT; @@ -1017,12 +1030,16 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) if (map->osd_primary_affinity) { u32 *affinity; - affinity = krealloc(map->osd_primary_affinity, - max*sizeof(*affinity), GFP_NOFS); + affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)), + GFP_NOFS); if (!affinity) return -ENOMEM; - map->osd_primary_affinity = affinity; + memcpy(affinity, map->osd_primary_affinity, + to_copy * sizeof(*affinity)); + kvfree(map->osd_primary_affinity); + + map->osd_primary_affinity = affinity; for (i = map->max_osd; i < max; i++) map->osd_primary_affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; @@ -1043,7 +1060,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); dout("%s work_size %zu bytes\n", __func__, work_size); - workspace = kmalloc(work_size, GFP_NOIO); + workspace = ceph_kvmalloc(work_size, GFP_NOIO); if (!workspace) { crush_destroy(crush); return -ENOMEM; @@ -1052,7 +1069,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) if (map->crush) crush_destroy(map->crush); - kfree(map->crush_workspace); + kvfree(map->crush_workspace); map->crush = crush; map->crush_workspace = workspace; return 0; @@ -1298,9 +1315,9 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) if (!map->osd_primary_affinity) { int i; - map->osd_primary_affinity = kmalloc_array(map->max_osd, - sizeof(u32), - GFP_NOFS); + map->osd_primary_affinity = ceph_kvmalloc( + array_size(map->max_osd, sizeof(*map->osd_primary_affinity)), + GFP_NOFS); if (!map->osd_primary_affinity) return -ENOMEM; @@ -1321,7 +1338,7 @@ static int decode_primary_affinity(void **p, void *end, ceph_decode_32_safe(p, end, len, e_inval); if (len == 0) { - kfree(map->osd_primary_affinity); + kvfree(map->osd_primary_affinity); map->osd_primary_affinity = NULL; return 0; } diff --git a/net/compat.c b/net/compat.c index 0f7ded26059e..47d99c784947 100644 --- a/net/compat.c +++ b/net/compat.c @@ -232,7 +232,7 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *dat (type == SO_TIMESTAMPNS_OLD || type == SO_TIMESTAMPING_OLD)) { int count = type == SO_TIMESTAMPNS_OLD ? 1 : 3; int i; - struct timespec *ts = (struct timespec *)data; + struct __kernel_old_timespec *ts = data; for (i = 0; i < count; i++) { cts[i].tv_sec = ts[i].tv_sec; cts[i].tv_nsec = ts[i].tv_nsec; diff --git a/net/core/Makefile b/net/core/Makefile index a104dc8faafc..3e2c378e5f31 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -8,7 +8,7 @@ obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o -obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ +obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ fib_notifier.o xdp.o flow_offload.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 94c7f77ecb6b..3ab23f698221 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -12,6 +12,9 @@ static atomic_t cache_idx; +#define SK_STORAGE_CREATE_FLAG_MASK \ + (BPF_F_NO_PREALLOC | BPF_F_CLONE) + struct bucket { struct hlist_head list; raw_spinlock_t lock; @@ -209,7 +212,6 @@ static void selem_unlink_sk(struct bpf_sk_storage_elem *selem) kfree_rcu(sk_storage, rcu); } -/* sk_storage->lock must be held and sk_storage->list cannot be empty */ static void __selem_link_sk(struct bpf_sk_storage *sk_storage, struct bpf_sk_storage_elem *selem) { @@ -509,7 +511,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map) return 0; } -/* Called by __sk_destruct() */ +/* Called by __sk_destruct() & bpf_sk_storage_clone() */ void bpf_sk_storage_free(struct sock *sk) { struct bpf_sk_storage_elem *selem; @@ -557,6 +559,11 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) smap = (struct bpf_sk_storage_map *)map; + /* Note that this map might be concurrently cloned from + * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone + * RCU read section to finish before proceeding. New RCU + * read sections should be prevented via bpf_map_inc_not_zero. + */ synchronize_rcu(); /* bpf prog and the userspace can no longer access this map @@ -601,7 +608,9 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) { - if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries || + if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK || + !(attr->map_flags & BPF_F_NO_PREALLOC) || + attr->max_entries || attr->key_size != sizeof(int) || !attr->value_size || /* Enforce BTF for userspace sk dumping */ !attr->btf_key_type_id || !attr->btf_value_type_id) @@ -634,9 +643,10 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); + nbuckets = roundup_pow_of_two(num_possible_cpus()); /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ - smap->bucket_log = max_t(u32, 1, ilog2(roundup_pow_of_two(num_possible_cpus()))); - nbuckets = 1U << smap->bucket_log; + nbuckets = max_t(u32, 2, nbuckets); + smap->bucket_log = ilog2(nbuckets); cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); ret = bpf_map_charge_init(&smap->map.memory, cost); @@ -739,6 +749,95 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) return err; } +static struct bpf_sk_storage_elem * +bpf_sk_storage_clone_elem(struct sock *newsk, + struct bpf_sk_storage_map *smap, + struct bpf_sk_storage_elem *selem) +{ + struct bpf_sk_storage_elem *copy_selem; + + copy_selem = selem_alloc(smap, newsk, NULL, true); + if (!copy_selem) + return NULL; + + if (map_value_has_spin_lock(&smap->map)) + copy_map_value_locked(&smap->map, SDATA(copy_selem)->data, + SDATA(selem)->data, true); + else + copy_map_value(&smap->map, SDATA(copy_selem)->data, + SDATA(selem)->data); + + return copy_selem; +} + +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) +{ + struct bpf_sk_storage *new_sk_storage = NULL; + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_elem *selem; + int ret = 0; + + RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); + + rcu_read_lock(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); + + if (!sk_storage || hlist_empty(&sk_storage->list)) + goto out; + + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { + struct bpf_sk_storage_elem *copy_selem; + struct bpf_sk_storage_map *smap; + struct bpf_map *map; + + smap = rcu_dereference(SDATA(selem)->smap); + if (!(smap->map.map_flags & BPF_F_CLONE)) + continue; + + /* Note that for lockless listeners adding new element + * here can race with cleanup in bpf_sk_storage_map_free. + * Try to grab map refcnt to make sure that it's still + * alive and prevent concurrent removal. + */ + map = bpf_map_inc_not_zero(&smap->map); + if (IS_ERR(map)) + continue; + + copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem); + if (!copy_selem) { + ret = -ENOMEM; + bpf_map_put(map); + goto out; + } + + if (new_sk_storage) { + selem_link_map(smap, copy_selem); + __selem_link_sk(new_sk_storage, copy_selem); + } else { + ret = sk_storage_alloc(newsk, smap, copy_selem); + if (ret) { + kfree(copy_selem); + atomic_sub(smap->elem_size, + &newsk->sk_omem_alloc); + bpf_map_put(map); + goto out; + } + + new_sk_storage = rcu_dereference(copy_selem->sk_storage); + } + bpf_map_put(map); + } + +out: + rcu_read_unlock(); + + /* In case of an error, don't free anything explicitly here, the + * caller is responsible to call bpf_sk_storage_free. + */ + + return ret; +} + BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, void *, value, u64, flags) { diff --git a/net/core/datagram.c b/net/core/datagram.c index 45a162ef5e02..a78e7f864c1e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -84,7 +84,8 @@ static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, i /* * Wait for the last received packet to be different from skb */ -int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, +int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, + int *err, long *timeo_p, const struct sk_buff *skb) { int error; @@ -97,7 +98,7 @@ int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, if (error) goto out_err; - if (sk->sk_receive_queue.prev != skb) + if (READ_ONCE(queue->prev) != skb) goto out; /* Socket shut down? */ @@ -209,6 +210,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, /** * __skb_try_recv_datagram - Receive a datagram skbuff * @sk: socket + * @queue: socket queue from which to receive * @flags: MSG\_ flags * @destructor: invoked under the receive lock on successful dequeue * @off: an offset in bytes to peek skb from. Returns an offset @@ -241,13 +243,14 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, * quite explicitly by POSIX 1003.1g, don't change them without having * the standard around please. */ -struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, +struct sk_buff *__skb_try_recv_datagram(struct sock *sk, + struct sk_buff_head *queue, + unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), int *off, int *err, struct sk_buff **last) { - struct sk_buff_head *queue = &sk->sk_receive_queue; struct sk_buff *skb; unsigned long cpu_flags; /* @@ -278,7 +281,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, break; sk_busy_loop(sk, flags & MSG_DONTWAIT); - } while (sk->sk_receive_queue.prev != *last); + } while (READ_ONCE(queue->prev) != *last); error = -EAGAIN; @@ -288,7 +291,9 @@ no_packet: } EXPORT_SYMBOL(__skb_try_recv_datagram); -struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, +struct sk_buff *__skb_recv_datagram(struct sock *sk, + struct sk_buff_head *sk_queue, + unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), int *off, int *err) @@ -299,15 +304,16 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { - skb = __skb_try_recv_datagram(sk, flags, destructor, off, err, - &last); + skb = __skb_try_recv_datagram(sk, sk_queue, flags, destructor, + off, err, &last); if (skb) return skb; if (*err != -EAGAIN) break; } while (timeo && - !__skb_wait_for_more_packets(sk, err, &timeo, last)); + !__skb_wait_for_more_packets(sk, sk_queue, err, + &timeo, last)); return NULL; } @@ -318,7 +324,8 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, { int off = 0; - return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), + return __skb_recv_datagram(sk, &sk->sk_receive_queue, + flags | (noblock ? MSG_DONTWAIT : 0), NULL, &off, err); } EXPORT_SYMBOL(skb_recv_datagram); @@ -442,8 +449,8 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, if (copy > len) copy = len; - n = cb(vaddr + frag->page_offset + - offset - start, copy, data, to); + n = cb(vaddr + skb_frag_off(frag) + offset - start, + copy, data, to); kunmap(page); offset += n; if (n != copy) @@ -573,7 +580,7 @@ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, if (copy > len) copy = len; copied = copy_page_from_iter(skb_frag_page(frag), - frag->page_offset + offset - start, + skb_frag_off(frag) + offset - start, copy, from); if (copied != copy) goto fault; @@ -640,7 +647,7 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, skb->len += copied; skb->truesize += truesize; if (sk && sk->sk_type == SOCK_STREAM) { - sk->sk_wmem_queued += truesize; + sk_wmem_queued_add(sk, truesize); sk_mem_charge(sk, truesize); } else { refcount_add(truesize, &skb->sk->sk_wmem_alloc); @@ -767,7 +774,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, mask = 0; /* exceptional events? */ - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); @@ -777,7 +784,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, mask |= EPOLLHUP; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ diff --git a/net/core/dev.c b/net/core/dev.c index 0891f499c1bb..a6316b336128 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -146,6 +146,7 @@ #include "net-sysfs.h" #define MAX_GRO_SKBS 8 +#define MAX_NEST_DEV 8 /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) @@ -228,6 +229,122 @@ static inline void rps_unlock(struct softnet_data *sd) #endif } +static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, + const char *name) +{ + struct netdev_name_node *name_node; + + name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); + if (!name_node) + return NULL; + INIT_HLIST_NODE(&name_node->hlist); + name_node->dev = dev; + name_node->name = name; + return name_node; +} + +static struct netdev_name_node * +netdev_name_node_head_alloc(struct net_device *dev) +{ + struct netdev_name_node *name_node; + + name_node = netdev_name_node_alloc(dev, dev->name); + if (!name_node) + return NULL; + INIT_LIST_HEAD(&name_node->list); + return name_node; +} + +static void netdev_name_node_free(struct netdev_name_node *name_node) +{ + kfree(name_node); +} + +static void netdev_name_node_add(struct net *net, + struct netdev_name_node *name_node) +{ + hlist_add_head_rcu(&name_node->hlist, + dev_name_hash(net, name_node->name)); +} + +static void netdev_name_node_del(struct netdev_name_node *name_node) +{ + hlist_del_rcu(&name_node->hlist); +} + +static struct netdev_name_node *netdev_name_node_lookup(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + +static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry_rcu(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + +int netdev_name_node_alt_create(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (name_node) + return -EEXIST; + name_node = netdev_name_node_alloc(dev, name); + if (!name_node) + return -ENOMEM; + netdev_name_node_add(net, name_node); + /* The node that holds dev->name acts as a head of per-device list. */ + list_add_tail(&name_node->list, &dev->name_node->list); + + return 0; +} +EXPORT_SYMBOL(netdev_name_node_alt_create); + +static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +{ + list_del(&name_node->list); + netdev_name_node_del(name_node); + kfree(name_node->name); + netdev_name_node_free(name_node); +} + +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (!name_node) + return -ENOENT; + __netdev_name_node_alt_destroy(name_node); + + return 0; +} +EXPORT_SYMBOL(netdev_name_node_alt_destroy); + +static void netdev_name_node_alt_flush(struct net_device *dev) +{ + struct netdev_name_node *name_node, *tmp; + + list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) + __netdev_name_node_alt_destroy(name_node); +} + /* Device list insertion */ static void list_netdevice(struct net_device *dev) { @@ -237,7 +354,7 @@ static void list_netdevice(struct net_device *dev) write_lock_bh(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); - hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); write_unlock_bh(&dev_base_lock); @@ -255,7 +372,7 @@ static void unlist_netdevice(struct net_device *dev) /* Unlink dev from the device chain */ write_lock_bh(&dev_base_lock); list_del_rcu(&dev->dev_list); - hlist_del_rcu(&dev->name_hlist); + netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); write_unlock_bh(&dev_base_lock); @@ -276,88 +393,6 @@ static RAW_NOTIFIER_HEAD(netdev_chain); DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); EXPORT_PER_CPU_SYMBOL(softnet_data); -#ifdef CONFIG_LOCKDEP -/* - * register_netdevice() inits txq->_xmit_lock and sets lockdep class - * according to dev->type - */ -static const unsigned short netdev_lock_type[] = { - ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, - ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, - ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, - ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, - ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, - ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, - ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, - ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, - ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, - ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, - ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, - ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, - ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, - ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, - ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; - -static const char *const netdev_lock_name[] = { - "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", - "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", - "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", - "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", - "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", - "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", - "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", - "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", - "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", - "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", - "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", - "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", - "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", - "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", - "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; - -static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; -static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; - -static inline unsigned short netdev_lock_pos(unsigned short dev_type) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) - if (netdev_lock_type[i] == dev_type) - return i; - /* the last key is used by default */ - return ARRAY_SIZE(netdev_lock_type) - 1; -} - -static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, - unsigned short dev_type) -{ - int i; - - i = netdev_lock_pos(dev_type); - lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], - netdev_lock_name[i]); -} - -static inline void netdev_set_addr_lockdep_class(struct net_device *dev) -{ - int i; - - i = netdev_lock_pos(dev->type); - lockdep_set_class_and_name(&dev->addr_list_lock, - &netdev_addr_lock_key[i], - netdev_lock_name[i]); -} -#else -static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, - unsigned short dev_type) -{ -} -static inline void netdev_set_addr_lockdep_class(struct net_device *dev) -{ -} -#endif - /******************************************************************************* * * Protocol management and registration routines @@ -733,14 +768,10 @@ EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); struct net_device *__dev_get_by_name(struct net *net, const char *name) { - struct net_device *dev; - struct hlist_head *head = dev_name_hash(net, name); - - hlist_for_each_entry(dev, head, name_hlist) - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; + struct netdev_name_node *node_name; - return NULL; + node_name = netdev_name_node_lookup(net, name); + return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(__dev_get_by_name); @@ -758,14 +789,10 @@ EXPORT_SYMBOL(__dev_get_by_name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { - struct net_device *dev; - struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *node_name; - hlist_for_each_entry_rcu(dev, head, name_hlist) - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; - - return NULL; + node_name = netdev_name_node_lookup_rcu(net, name); + return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(dev_get_by_name_rcu); @@ -901,7 +928,7 @@ EXPORT_SYMBOL(dev_get_by_napi_id); * * The use of raw_seqcount_begin() and cond_resched() before * retrying is required as we want to give the writers a chance - * to complete when CONFIG_PREEMPT is not set. + * to complete when CONFIG_PREEMPTION is not set. */ int netdev_get_name(struct net *net, char *name, int ifindex) { @@ -1141,8 +1168,8 @@ int dev_alloc_name(struct net_device *dev, const char *name) } EXPORT_SYMBOL(dev_alloc_name); -int dev_get_valid_name(struct net *net, struct net_device *dev, - const char *name) +static int dev_get_valid_name(struct net *net, struct net_device *dev, + const char *name) { BUG_ON(!net); @@ -1158,7 +1185,6 @@ int dev_get_valid_name(struct net *net, struct net_device *dev, return 0; } -EXPORT_SYMBOL(dev_get_valid_name); /** * dev_change_name - change name of a device @@ -1232,13 +1258,13 @@ rollback: netdev_adjacent_rename_links(dev, oldname); write_lock_bh(&dev_base_lock); - hlist_del_rcu(&dev->name_hlist); + netdev_name_node_del(dev->name_node); write_unlock_bh(&dev_base_lock); synchronize_rcu(); write_lock_bh(&dev_base_lock); - hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + netdev_name_node_add(net, dev->name_node); write_unlock_bh(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); @@ -1288,8 +1314,8 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) } mutex_lock(&ifalias_mutex); - rcu_swap_protected(dev->ifalias, new_alias, - mutex_is_locked(&ifalias_mutex)); + new_alias = rcu_replace_pointer(dev->ifalias, new_alias, + mutex_is_locked(&ifalias_mutex)); mutex_unlock(&ifalias_mutex); if (new_alias) @@ -1617,6 +1643,62 @@ static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, return nb->notifier_call(nb, val, &info); } +static int call_netdevice_register_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + int err; + + err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); + err = notifier_to_errno(err); + if (err) + return err; + + if (!(dev->flags & IFF_UP)) + return 0; + + call_netdevice_notifier(nb, NETDEV_UP, dev); + return 0; +} + +static void call_netdevice_unregister_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); + } + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); +} + +static int call_netdevice_register_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + int err; + + for_each_netdev(net, dev) { + err = call_netdevice_register_notifiers(nb, dev); + if (err) + goto rollback; + } + return 0; + +rollback: + for_each_netdev_continue_reverse(net, dev) + call_netdevice_unregister_notifiers(nb, dev); + return err; +} + +static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + + for_each_netdev(net, dev) + call_netdevice_unregister_notifiers(nb, dev); +} + static int dev_boot_phase = 1; /** @@ -1635,8 +1717,6 @@ static int dev_boot_phase = 1; int register_netdevice_notifier(struct notifier_block *nb) { - struct net_device *dev; - struct net_device *last; struct net *net; int err; @@ -1649,17 +1729,9 @@ int register_netdevice_notifier(struct notifier_block *nb) if (dev_boot_phase) goto unlock; for_each_net(net) { - for_each_netdev(net, dev) { - err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); - err = notifier_to_errno(err); - if (err) - goto rollback; - - if (!(dev->flags & IFF_UP)) - continue; - - call_netdevice_notifier(nb, NETDEV_UP, dev); - } + err = call_netdevice_register_net_notifiers(nb, net); + if (err) + goto rollback; } unlock: @@ -1668,22 +1740,9 @@ unlock: return err; rollback: - last = dev; - for_each_net(net) { - for_each_netdev(net, dev) { - if (dev == last) - goto outroll; - - if (dev->flags & IFF_UP) { - call_netdevice_notifier(nb, NETDEV_GOING_DOWN, - dev); - call_netdevice_notifier(nb, NETDEV_DOWN, dev); - } - call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); - } - } + for_each_net_continue_reverse(net) + call_netdevice_unregister_net_notifiers(nb, net); -outroll: raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } @@ -1705,7 +1764,6 @@ EXPORT_SYMBOL(register_netdevice_notifier); int unregister_netdevice_notifier(struct notifier_block *nb) { - struct net_device *dev; struct net *net; int err; @@ -1716,16 +1774,9 @@ int unregister_netdevice_notifier(struct notifier_block *nb) if (err) goto unlock; - for_each_net(net) { - for_each_netdev(net, dev) { - if (dev->flags & IFF_UP) { - call_netdevice_notifier(nb, NETDEV_GOING_DOWN, - dev); - call_netdevice_notifier(nb, NETDEV_DOWN, dev); - } - call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); - } - } + for_each_net(net) + call_netdevice_unregister_net_notifiers(nb, net); + unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); @@ -1733,6 +1784,138 @@ unlock: } EXPORT_SYMBOL(unregister_netdevice_notifier); +static int __register_netdevice_notifier_net(struct net *net, + struct notifier_block *nb, + bool ignore_call_fail) +{ + int err; + + err = raw_notifier_chain_register(&net->netdev_chain, nb); + if (err) + return err; + if (dev_boot_phase) + return 0; + + err = call_netdevice_register_net_notifiers(nb, net); + if (err && !ignore_call_fail) + goto chain_unregister; + + return 0; + +chain_unregister: + raw_notifier_chain_unregister(&net->netdev_chain, nb); + return err; +} + +static int __unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb) +{ + int err; + + err = raw_notifier_chain_unregister(&net->netdev_chain, nb); + if (err) + return err; + + call_netdevice_unregister_net_notifiers(nb, net); + return 0; +} + +/** + * register_netdevice_notifier_net - register a per-netns network notifier block + * @net: network namespace + * @nb: notifier + * + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + * + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. + */ + +int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) +{ + int err; + + rtnl_lock(); + err = __register_netdevice_notifier_net(net, nb, false); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(register_netdevice_notifier_net); + +/** + * unregister_netdevice_notifier_net - unregister a per-netns + * network notifier block + * @net: network namespace + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_netdevice_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + * + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. + */ + +int unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb) +{ + int err; + + rtnl_lock(); + err = __unregister_netdevice_notifier_net(net, nb); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier_net); + +int register_netdevice_notifier_dev_net(struct net_device *dev, + struct notifier_block *nb, + struct netdev_net_notifier *nn) +{ + int err; + + rtnl_lock(); + err = __register_netdevice_notifier_net(dev_net(dev), nb, false); + if (!err) { + nn->nb = nb; + list_add(&nn->list, &dev->net_notifier_list); + } + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(register_netdevice_notifier_dev_net); + +int unregister_netdevice_notifier_dev_net(struct net_device *dev, + struct notifier_block *nb, + struct netdev_net_notifier *nn) +{ + int err; + + rtnl_lock(); + list_del(&nn->list); + err = __unregister_netdevice_notifier_net(dev_net(dev), nb); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); + +static void move_netdevice_notifiers_dev_net(struct net_device *dev, + struct net *net) +{ + struct netdev_net_notifier *nn; + + list_for_each_entry(nn, &dev->net_notifier_list, list) { + __unregister_netdevice_notifier_net(dev_net(dev), nn->nb); + __register_netdevice_notifier_net(net, nn->nb, true); + } +} + /** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function @@ -1745,7 +1928,18 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); static int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info) { + struct net *net = dev_net(info->dev); + int ret; + ASSERT_RTNL(); + + /* Run per-netns notifier block chain first, then run the global one. + * Hopefully, one day, the global one is going to be removed after + * all notifier block registrators get converted to be per-netns. + */ + ret = raw_notifier_call_chain(&net->netdev_chain, val, info); + if (ret & NOTIFY_STOP_MASK) + return ret; return raw_notifier_call_chain(&netdev_chain, val, info); } @@ -2771,7 +2965,7 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) void netif_schedule_queue(struct netdev_queue *txq) { rcu_read_lock(); - if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { + if (!netif_xmit_stopped(txq)) { struct Qdisc *q = rcu_dereference(txq->qdisc); __netif_schedule(q); @@ -2939,12 +3133,9 @@ int skb_checksum_help(struct sk_buff *skb) offset += skb->csum_offset; BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); - if (skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(__sum16))) { - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - if (ret) - goto out; - } + ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); + if (ret) + goto out; *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: @@ -2979,12 +3170,11 @@ int skb_crc32c_csum_help(struct sk_buff *skb) ret = -EINVAL; goto out; } - if (skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(__le32))) { - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - if (ret) - goto out; - } + + ret = skb_ensure_writable(skb, offset + sizeof(__le32)); + if (ret) + goto out; + crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, skb->len - start, ~(__u32)0, crc32c_csum_stub)); @@ -3109,7 +3299,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, segs = skb_mac_gso_segment(skb, features); - if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) + if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) skb_warn_bad_offload(skb); return segs; @@ -3467,18 +3657,22 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); if (q->flags & TCQ_F_NOLOCK) { - if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { - __qdisc_drop(skb, &to_free); - rc = NET_XMIT_DROP; - } else if ((q->flags & TCQ_F_CAN_BYPASS) && q->empty && - qdisc_run_begin(q)) { + if ((q->flags & TCQ_F_CAN_BYPASS) && READ_ONCE(q->empty) && + qdisc_run_begin(q)) { + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, + &q->state))) { + __qdisc_drop(skb, &to_free); + rc = NET_XMIT_DROP; + goto end_run; + } qdisc_bstats_cpu_update(q, skb); + rc = NET_XMIT_SUCCESS; if (sch_direct_xmit(skb, q, dev, txq, NULL, true)) __qdisc_run(q); +end_run: qdisc_run_end(q); - rc = NET_XMIT_SUCCESS; } else { rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; qdisc_run(q); @@ -3963,6 +4157,8 @@ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ int dev_rx_weight __read_mostly = 64; int dev_tx_weight __read_mostly = 64; +/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */ +int gro_normal_batch __read_mostly = 8; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, @@ -4331,14 +4527,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. */ - if (skb_cloned(skb) || skb_is_tc_redirected(skb)) + if (skb_is_tc_redirected(skb)) return XDP_PASS; /* XDP packets must be linear and must have sufficient headroom * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also * native XDP provides, thus we need to do it here as well. */ - if (skb_is_nonlinear(skb) || + if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); int troom = skb->tail + skb->data_len - skb->end; @@ -4786,7 +4982,6 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { -#ifdef CONFIG_NETFILTER_INGRESS if (nf_hook_ingress_active(skb)) { int ingress_retval; @@ -4800,7 +4995,6 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, rcu_read_unlock(); return ingress_retval; } -#endif /* CONFIG_NETFILTER_INGRESS */ return 0; } @@ -5345,9 +5539,29 @@ static void flush_all_backlogs(void) put_online_cpus(); } +/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ +static void gro_normal_list(struct napi_struct *napi) +{ + if (!napi->rx_count) + return; + netif_receive_skb_list_internal(&napi->rx_list); + INIT_LIST_HEAD(&napi->rx_list); + napi->rx_count = 0; +} + +/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, + * pass the whole batch up to the stack. + */ +static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) +{ + list_add_tail(&skb->list, &napi->rx_list); + if (++napi->rx_count >= gro_normal_batch) + gro_normal_list(napi); +} + INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); -static int napi_gro_complete(struct sk_buff *skb) +static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) { struct packet_offload *ptype; __be16 type = skb->protocol; @@ -5380,7 +5594,8 @@ static int napi_gro_complete(struct sk_buff *skb) } out: - return netif_receive_skb_internal(skb); + gro_normal_one(napi, skb); + return NET_RX_SUCCESS; } static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, @@ -5393,7 +5608,7 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; skb_list_del_init(skb); - napi_gro_complete(skb); + napi_gro_complete(napi, skb); napi->gro_hash[index].count--; } @@ -5440,7 +5655,7 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); if (skb_vlan_tag_present(p)) - diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) @@ -5465,8 +5680,7 @@ static void skb_gro_reset_offset(struct sk_buff *skb) NAPI_GRO_CB(skb)->frag0 = NULL; NAPI_GRO_CB(skb)->frag0_len = 0; - if (skb_mac_header(skb) == skb_tail_pointer(skb) && - pinfo->nr_frags && + if (!skb_headlen(skb) && pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, @@ -5486,7 +5700,7 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) skb->data_len -= grow; skb->tail += grow; - pinfo->frags[0].page_offset += grow; + skb_frag_off_add(&pinfo->frags[0], grow); skb_frag_size_sub(&pinfo->frags[0], grow); if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { @@ -5496,7 +5710,7 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) } } -static void gro_flush_oldest(struct list_head *head) +static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) { struct sk_buff *oldest; @@ -5512,7 +5726,7 @@ static void gro_flush_oldest(struct list_head *head) * SKB to the chain. */ skb_list_del_init(oldest); - napi_gro_complete(oldest); + napi_gro_complete(napi, oldest); } INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, @@ -5578,7 +5792,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (&ptype->list == head) goto normal; - if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) { + if (PTR_ERR(pp) == -EINPROGRESS) { ret = GRO_CONSUMED; goto ok; } @@ -5588,7 +5802,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (pp) { skb_list_del_init(pp); - napi_gro_complete(pp); + napi_gro_complete(napi, pp); napi->gro_hash[hash].count--; } @@ -5599,7 +5813,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff goto normal; if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) { - gro_flush_oldest(gro_head); + gro_flush_oldest(napi, gro_head); } else { napi->gro_hash[hash].count++; } @@ -5660,16 +5874,17 @@ EXPORT_SYMBOL(gro_find_complete_by_type); static void napi_skb_free_stolen_head(struct sk_buff *skb) { skb_dst_drop(skb); - secpath_reset(skb); + skb_ext_put(skb); kmem_cache_free(skbuff_head_cache, skb); } -static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) +static gro_result_t napi_skb_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) { switch (ret) { case GRO_NORMAL: - if (netif_receive_skb_internal(skb)) - ret = GRO_DROP; + gro_normal_one(napi, skb); break; case GRO_DROP: @@ -5701,7 +5916,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) skb_gro_reset_offset(skb); - ret = napi_skb_finish(dev_gro_receive(napi, skb), skb); + ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_receive_exit(ret); return ret; @@ -5727,7 +5942,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); - secpath_reset(skb); + skb_ext_reset(skb); napi->skb = skb; } @@ -5756,8 +5971,8 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, case GRO_HELD: __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); - if (ret == GRO_NORMAL && netif_receive_skb_internal(skb)) - ret = GRO_DROP; + if (ret == GRO_NORMAL) + gro_normal_one(napi, skb); break; case GRO_DROP: @@ -6049,6 +6264,9 @@ bool napi_complete_done(struct napi_struct *n, int work_done) hrtimer_start(&n->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); } + + gro_normal_list(n); + if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ local_irq_save(flags); @@ -6119,10 +6337,19 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ rc = napi->poll(napi, BUSY_POLL_BUDGET); + /* We can't gro_normal_list() here, because napi->poll() might have + * rearmed the napi (napi_complete_done()) in which case it could + * already be running on another CPU. + */ trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); netpoll_poll_unlock(have_poll_lock); - if (rc == BUSY_POLL_BUDGET) + if (rc == BUSY_POLL_BUDGET) { + /* As the whole budget was spent, we still own the napi so can + * safely handle the rx_list. + */ + gro_normal_list(napi); __napi_schedule(napi); + } local_bh_enable(); } @@ -6167,6 +6394,7 @@ restart: } work = napi_poll(napi, BUSY_POLL_BUDGET); trace_napi_poll(napi, work, BUSY_POLL_BUDGET); + gro_normal_list(napi); count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), @@ -6272,6 +6500,8 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, napi->timer.function = napi_watchdog; init_gro_hash(napi); napi->skb = NULL; + INIT_LIST_HEAD(&napi->rx_list); + napi->rx_count = 0; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) netdev_err_once(dev, "%s() called with weight %d\n", __func__, @@ -6375,6 +6605,8 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) napi_gro_flush(n, HZ >= 1000); } + gro_normal_list(n); + /* Some drivers may have called napi_schedule * prior to exhausting their budget. */ @@ -6447,6 +6679,9 @@ struct netdev_adjacent { /* upper master flag, there can only be one master device per list */ bool master; + /* lookup ignore flag */ + bool ignore; + /* counter for the number of times this device was added to us */ u16 ref_nr; @@ -6469,7 +6704,7 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, return NULL; } -static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data) +static int ____netdev_has_upper_dev(struct net_device *upper_dev, void *data) { struct net_device *dev = data; @@ -6490,7 +6725,7 @@ bool netdev_has_upper_dev(struct net_device *dev, { ASSERT_RTNL(); - return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev, + return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, upper_dev); } EXPORT_SYMBOL(netdev_has_upper_dev); @@ -6508,7 +6743,7 @@ EXPORT_SYMBOL(netdev_has_upper_dev); bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev) { - return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev, + return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, upper_dev); } EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); @@ -6552,6 +6787,22 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get); +static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev) +{ + struct netdev_adjacent *upper; + + ASSERT_RTNL(); + + if (list_empty(&dev->adj_list.upper)) + return NULL; + + upper = list_first_entry(&dev->adj_list.upper, + struct netdev_adjacent, list); + if (likely(upper->master) && !upper->ignore) + return upper->dev; + return NULL; +} + /** * netdev_has_any_lower_dev - Check if device is linked to some device * @dev: device @@ -6602,6 +6853,23 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, } EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); +static struct net_device *__netdev_next_upper_dev(struct net_device *dev, + struct list_head **iter, + bool *ignore) +{ + struct netdev_adjacent *upper; + + upper = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + *ignore = upper->ignore; + + return upper->dev; +} + static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, struct list_head **iter) { @@ -6619,34 +6887,111 @@ static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, return upper->dev; } +static int __netdev_walk_all_upper_dev(struct net_device *dev, + int (*fn)(struct net_device *dev, + void *data), + void *data) +{ + struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + bool ignore; + + now = dev; + iter = &dev->adj_list.upper; + + while (1) { + if (now != dev) { + ret = fn(now, data); + if (ret) + return ret; + } + + next = NULL; + while (1) { + udev = __netdev_next_upper_dev(now, &iter, &ignore); + if (!udev) + break; + if (ignore) + continue; + + next = udev; + niter = &udev->adj_list.upper; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} + int netdev_walk_all_upper_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, void *data), void *data) { - struct net_device *udev; - struct list_head *iter; - int ret; + struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; - for (iter = &dev->adj_list.upper, - udev = netdev_next_upper_dev_rcu(dev, &iter); - udev; - udev = netdev_next_upper_dev_rcu(dev, &iter)) { - /* first is the upper device itself */ - ret = fn(udev, data); - if (ret) - return ret; + now = dev; + iter = &dev->adj_list.upper; - /* then look at all of its upper devices */ - ret = netdev_walk_all_upper_dev_rcu(udev, fn, data); - if (ret) - return ret; + while (1) { + if (now != dev) { + ret = fn(now, data); + if (ret) + return ret; + } + + next = NULL; + while (1) { + udev = netdev_next_upper_dev_rcu(now, &iter); + if (!udev) + break; + + next = udev; + niter = &udev->adj_list.upper; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); +static bool __netdev_has_upper_dev(struct net_device *dev, + struct net_device *upper_dev) +{ + ASSERT_RTNL(); + + return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev, + upper_dev); +} + /** * netdev_lower_get_next_private - Get the next ->private from the * lower neighbour list @@ -6743,34 +7088,119 @@ static struct net_device *netdev_next_lower_dev(struct net_device *dev, return lower->dev; } +static struct net_device *__netdev_next_lower_dev(struct net_device *dev, + struct list_head **iter, + bool *ignore) +{ + struct netdev_adjacent *lower; + + lower = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + *ignore = lower->ignore; + + return lower->dev; +} + int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, void *data), void *data) { - struct net_device *ldev; - struct list_head *iter; - int ret; + struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; - for (iter = &dev->adj_list.lower, - ldev = netdev_next_lower_dev(dev, &iter); - ldev; - ldev = netdev_next_lower_dev(dev, &iter)) { - /* first is the lower device itself */ - ret = fn(ldev, data); - if (ret) - return ret; + now = dev; + iter = &dev->adj_list.lower; - /* then look at all of its lower devices */ - ret = netdev_walk_all_lower_dev(ldev, fn, data); - if (ret) - return ret; + while (1) { + if (now != dev) { + ret = fn(now, data); + if (ret) + return ret; + } + + next = NULL; + while (1) { + ldev = netdev_next_lower_dev(now, &iter); + if (!ldev) + break; + + next = ldev; + niter = &ldev->adj_list.lower; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); +static int __netdev_walk_all_lower_dev(struct net_device *dev, + int (*fn)(struct net_device *dev, + void *data), + void *data) +{ + struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + bool ignore; + + now = dev; + iter = &dev->adj_list.lower; + + while (1) { + if (now != dev) { + ret = fn(now, data); + if (ret) + return ret; + } + + next = NULL; + while (1) { + ldev = __netdev_next_lower_dev(now, &iter, &ignore); + if (!ldev) + break; + if (ignore) + continue; + + next = ldev; + niter = &ldev->adj_list.lower; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} + static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, struct list_head **iter) { @@ -6785,28 +7215,99 @@ static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, return lower->dev; } -int netdev_walk_all_lower_dev_rcu(struct net_device *dev, - int (*fn)(struct net_device *dev, - void *data), - void *data) +static u8 __netdev_upper_depth(struct net_device *dev) +{ + struct net_device *udev; + struct list_head *iter; + u8 max_depth = 0; + bool ignore; + + for (iter = &dev->adj_list.upper, + udev = __netdev_next_upper_dev(dev, &iter, &ignore); + udev; + udev = __netdev_next_upper_dev(dev, &iter, &ignore)) { + if (ignore) + continue; + if (max_depth < udev->upper_level) + max_depth = udev->upper_level; + } + + return max_depth; +} + +static u8 __netdev_lower_depth(struct net_device *dev) { struct net_device *ldev; struct list_head *iter; - int ret; + u8 max_depth = 0; + bool ignore; for (iter = &dev->adj_list.lower, - ldev = netdev_next_lower_dev_rcu(dev, &iter); + ldev = __netdev_next_lower_dev(dev, &iter, &ignore); ldev; - ldev = netdev_next_lower_dev_rcu(dev, &iter)) { - /* first is the lower device itself */ - ret = fn(ldev, data); - if (ret) - return ret; + ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) { + if (ignore) + continue; + if (max_depth < ldev->lower_level) + max_depth = ldev->lower_level; + } - /* then look at all of its lower devices */ - ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data); - if (ret) - return ret; + return max_depth; +} + +static int __netdev_update_upper_level(struct net_device *dev, void *data) +{ + dev->upper_level = __netdev_upper_depth(dev) + 1; + return 0; +} + +static int __netdev_update_lower_level(struct net_device *dev, void *data) +{ + dev->lower_level = __netdev_lower_depth(dev) + 1; + return 0; +} + +int netdev_walk_all_lower_dev_rcu(struct net_device *dev, + int (*fn)(struct net_device *dev, + void *data), + void *data) +{ + struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + + now = dev; + iter = &dev->adj_list.lower; + + while (1) { + if (now != dev) { + ret = fn(now, data); + if (ret) + return ret; + } + + next = NULL; + while (1) { + ldev = netdev_next_lower_dev_rcu(now, &iter); + if (!ldev) + break; + + next = ldev; + niter = &ldev->adj_list.lower; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; } return 0; @@ -6910,6 +7411,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj->master = master; adj->ref_nr = 1; adj->private = private; + adj->ignore = false; dev_hold(adj_dev); pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", @@ -7060,14 +7562,17 @@ static int __netdev_upper_dev_link(struct net_device *dev, return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ - if (netdev_has_upper_dev(upper_dev, dev)) + if (__netdev_has_upper_dev(upper_dev, dev)) return -EBUSY; + if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV) + return -EMLINK; + if (!master) { - if (netdev_has_upper_dev(dev, upper_dev)) + if (__netdev_has_upper_dev(dev, upper_dev)) return -EEXIST; } else { - master_dev = netdev_master_upper_dev_get(dev); + master_dev = __netdev_master_upper_dev_get(dev); if (master_dev) return master_dev == upper_dev ? -EEXIST : -EBUSY; } @@ -7089,6 +7594,13 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (ret) goto rollback; + __netdev_update_upper_level(dev, NULL); + __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); + + __netdev_update_lower_level(upper_dev, NULL); + __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, + NULL); + return 0; rollback: @@ -7171,9 +7683,96 @@ void netdev_upper_dev_unlink(struct net_device *dev, call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); + + __netdev_update_upper_level(dev, NULL); + __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); + + __netdev_update_lower_level(upper_dev, NULL); + __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, + NULL); } EXPORT_SYMBOL(netdev_upper_dev_unlink); +static void __netdev_adjacent_dev_set(struct net_device *upper_dev, + struct net_device *lower_dev, + bool val) +{ + struct netdev_adjacent *adj; + + adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower); + if (adj) + adj->ignore = val; + + adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper); + if (adj) + adj->ignore = val; +} + +static void netdev_adjacent_dev_disable(struct net_device *upper_dev, + struct net_device *lower_dev) +{ + __netdev_adjacent_dev_set(upper_dev, lower_dev, true); +} + +static void netdev_adjacent_dev_enable(struct net_device *upper_dev, + struct net_device *lower_dev) +{ + __netdev_adjacent_dev_set(upper_dev, lower_dev, false); +} + +int netdev_adjacent_change_prepare(struct net_device *old_dev, + struct net_device *new_dev, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + int err; + + if (!new_dev) + return 0; + + if (old_dev && new_dev != old_dev) + netdev_adjacent_dev_disable(dev, old_dev); + + err = netdev_upper_dev_link(new_dev, dev, extack); + if (err) { + if (old_dev && new_dev != old_dev) + netdev_adjacent_dev_enable(dev, old_dev); + return err; + } + + return 0; +} +EXPORT_SYMBOL(netdev_adjacent_change_prepare); + +void netdev_adjacent_change_commit(struct net_device *old_dev, + struct net_device *new_dev, + struct net_device *dev) +{ + if (!new_dev || !old_dev) + return; + + if (new_dev == old_dev) + return; + + netdev_adjacent_dev_enable(dev, old_dev); + netdev_upper_dev_unlink(old_dev, dev); +} +EXPORT_SYMBOL(netdev_adjacent_change_commit); + +void netdev_adjacent_change_abort(struct net_device *old_dev, + struct net_device *new_dev, + struct net_device *dev) +{ + if (!new_dev) + return; + + if (old_dev && new_dev != old_dev) + netdev_adjacent_dev_enable(dev, old_dev); + + netdev_upper_dev_unlink(new_dev, dev); +} +EXPORT_SYMBOL(netdev_adjacent_change_abort); + /** * netdev_bonding_info_change - Dispatch event about slave change * @dev: device @@ -7287,25 +7886,6 @@ void *netdev_lower_dev_get_private(struct net_device *dev, EXPORT_SYMBOL(netdev_lower_dev_get_private); -int dev_get_nest_level(struct net_device *dev) -{ - struct net_device *lower = NULL; - struct list_head *iter; - int max_nest = -1; - int nest; - - ASSERT_RTNL(); - - netdev_for_each_lower_dev(dev, lower, iter) { - nest = dev_get_nest_level(lower); - if (max_nest < nest) - max_nest = nest; - } - - return max_nest + 1; -} -EXPORT_SYMBOL(dev_get_nest_level); - /** * netdev_lower_change - Dispatch event about lower device state change * @lower_dev: device @@ -7658,11 +8238,28 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu) if (ops->ndo_change_mtu) return ops->ndo_change_mtu(dev, new_mtu); - dev->mtu = new_mtu; + /* Pairs with all the lockless reads of dev->mtu in the stack */ + WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(__dev_set_mtu); +int dev_validate_mtu(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack) +{ + /* MTU must be positive, and in range */ + if (new_mtu < 0 || new_mtu < dev->min_mtu) { + NL_SET_ERR_MSG(extack, "mtu less than device minimum"); + return -EINVAL; + } + + if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { + NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); + return -EINVAL; + } + return 0; +} + /** * dev_set_mtu_ext - Change maximum transfer unit * @dev: device @@ -7679,16 +8276,9 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, if (new_mtu == dev->mtu) return 0; - /* MTU must be positive, and in range */ - if (new_mtu < 0 || new_mtu < dev->min_mtu) { - NL_SET_ERR_MSG(extack, "mtu less than device minimum"); - return -EINVAL; - } - - if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { - NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); - return -EINVAL; - } + err = dev_validate_mtu(dev, new_mtu, extack); + if (err) + return err; if (!netif_device_present(dev)) return -ENODEV; @@ -8011,7 +8601,17 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { + bool non_hw = !(flags & XDP_FLAGS_HW_MODE); + struct bpf_prog *prev_prog = NULL; struct netdev_bpf xdp; + int err; + + if (non_hw) { + prev_prog = bpf_prog_by_id(__dev_xdp_query(dev, bpf_op, + XDP_QUERY_PROG)); + if (IS_ERR(prev_prog)) + prev_prog = NULL; + } memset(&xdp, 0, sizeof(xdp)); if (flags & XDP_FLAGS_HW_MODE) @@ -8022,7 +8622,14 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, xdp.flags = flags; xdp.prog = prog; - return bpf_op(dev, &xdp); + err = bpf_op(dev, &xdp); + if (!err && non_hw) + bpf_prog_change_xdp(prev_prog, prog); + + if (prev_prog) + bpf_prog_put(prev_prog); + + return err; } static void dev_xdp_uninstall(struct net_device *dev) @@ -8088,12 +8695,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, bpf_chk = generic_xdp_install; if (fd >= 0) { + u32 prog_id; + if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) { NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time"); return -EEXIST; } - if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_query(dev, bpf_op, query)) { + + prog_id = __dev_xdp_query(dev, bpf_op, query); + if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) { NL_SET_ERR_MSG(extack, "XDP program already attached"); return -EBUSY; } @@ -8108,6 +8718,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, bpf_prog_put(prog); return -EINVAL; } + + /* prog->aux->id may be 0 for orphaned device-bound progs */ + if (prog->aux->id && prog->aux->id == prog_id) { + bpf_prog_put(prog); + return 0; + } + } else { + if (!__dev_xdp_query(dev, bpf_op, query)) + return 0; } err = dev_xdp_install(dev, bpf_op, extack, flags, prog); @@ -8211,6 +8830,9 @@ static void rollback_registered_many(struct list_head *head) dev_uc_flush(dev); dev_mc_flush(dev); + netdev_name_node_alt_flush(dev); + netdev_name_node_free(dev->name_node); + if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); @@ -8566,7 +9188,7 @@ static void netdev_init_one_queue(struct net_device *dev, { /* Initialize queue lock */ spin_lock_init(&queue->_xmit_lock); - netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); + lockdep_set_class(&queue->_xmit_lock, &dev->qdisc_xmit_lock_key); queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; @@ -8613,6 +9235,31 @@ void netif_tx_stop_all_queues(struct net_device *dev) } EXPORT_SYMBOL(netif_tx_stop_all_queues); +static void netdev_register_lockdep_key(struct net_device *dev) +{ + lockdep_register_key(&dev->qdisc_tx_busylock_key); + lockdep_register_key(&dev->qdisc_running_key); + lockdep_register_key(&dev->qdisc_xmit_lock_key); + lockdep_register_key(&dev->addr_list_lock_key); +} + +static void netdev_unregister_lockdep_key(struct net_device *dev) +{ + lockdep_unregister_key(&dev->qdisc_tx_busylock_key); + lockdep_unregister_key(&dev->qdisc_running_key); + lockdep_unregister_key(&dev->qdisc_xmit_lock_key); + lockdep_unregister_key(&dev->addr_list_lock_key); +} + +void netdev_update_lockdep_key(struct net_device *dev) +{ + lockdep_unregister_key(&dev->addr_list_lock_key); + lockdep_register_key(&dev->addr_list_lock_key); + + lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key); +} +EXPORT_SYMBOL(netdev_update_lockdep_key); + /** * register_netdevice - register a network device * @dev: device to register @@ -8647,19 +9294,24 @@ int register_netdevice(struct net_device *dev) BUG_ON(!net); spin_lock_init(&dev->addr_list_lock); - netdev_set_addr_lockdep_class(dev); + lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key); ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) goto out; + ret = -ENOMEM; + dev->name_node = netdev_name_node_head_alloc(dev); + if (!dev->name_node) + goto out; + /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); if (ret) { if (ret > 0) ret = -EIO; - goto out; + goto err_free_name; } } @@ -8681,7 +9333,7 @@ int register_netdevice(struct net_device *dev) /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). */ - dev->hw_features |= NETIF_F_SOFT_FEATURES; + dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); dev->features |= NETIF_F_SOFT_FEATURES; if (dev->netdev_ops->ndo_udp_tunnel_add) { @@ -8726,8 +9378,10 @@ int register_netdevice(struct net_device *dev) goto err_uninit; ret = netdev_register_kobject(dev); - if (ret) + if (ret) { + dev->reg_state = NETREG_UNREGISTERED; goto err_uninit; + } dev->reg_state = NETREG_REGISTERED; __netdev_update_features(dev); @@ -8758,6 +9412,8 @@ int register_netdevice(struct net_device *dev) ret = notifier_to_errno(ret); if (ret) { rollback_registered(dev); + rcu_barrier(); + dev->reg_state = NETREG_UNREGISTERED; } /* @@ -8776,6 +9432,8 @@ err_uninit: dev->netdev_ops->ndo_uninit(dev); if (dev->priv_destructor) dev->priv_destructor(dev); +err_free_name: + netdev_name_node_free(dev->name_node); goto out; } EXPORT_SYMBOL(register_netdevice); @@ -9155,8 +9813,12 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); + netdev_register_lockdep_key(dev); + dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; + dev->upper_level = 1; + dev->lower_level = 1; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); @@ -9166,6 +9828,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->ptype_all); INIT_LIST_HEAD(&dev->ptype_specific); + INIT_LIST_HEAD(&dev->net_notifier_list); #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif @@ -9236,6 +9899,10 @@ void free_netdev(struct net_device *dev) free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; + free_percpu(dev->xdp_bulkq); + dev->xdp_bulkq = NULL; + + netdev_unregister_lockdep_key(dev); /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { @@ -9405,7 +10072,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); - new_nsid = peernet2id_alloc(dev_net(dev), net); + new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); /* If there is an ifindex conflict assign a new one */ if (__dev_get_by_index(net, dev->ifindex)) new_ifindex = dev_new_index(net); @@ -9425,6 +10092,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); netdev_adjacent_del_links(dev); + /* Move per-net netdevice notifiers that are following the netdevice */ + move_netdevice_notifiers_dev_net(dev, net); + /* Actually switch the network namespace */ dev_net_set(dev, net); dev->ifindex = new_ifindex; @@ -9567,7 +10237,7 @@ static struct hlist_head * __net_init netdev_create_hash(void) static int __net_init netdev_init(struct net *net) { BUILD_BUG_ON(GRO_HASH_BUCKETS > - 8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask)); + 8 * sizeof_field(struct napi_struct, gro_bitmask)); if (net != &init_net) INIT_LIST_HEAD(&net->dev_base_head); @@ -9580,6 +10250,8 @@ static int __net_init netdev_init(struct net *net) if (net->dev_index_head == NULL) goto err_idx; + RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); + return 0; err_idx: diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 6393ba930097..2f949b5a1eb9 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -637,7 +637,7 @@ int dev_uc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -667,7 +667,7 @@ int dev_uc_sync_multiple(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -691,7 +691,7 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from) return; netif_addr_lock_bh(from); - netif_addr_lock_nested(to); + netif_addr_lock(to); __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); @@ -858,7 +858,7 @@ int dev_mc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -888,7 +888,7 @@ int dev_mc_sync_multiple(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -912,7 +912,7 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from) return; netif_addr_lock_bh(from); - netif_addr_lock_nested(to); + netif_addr_lock(to); __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 5163d900bb4f..dbaebbe573f0 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -187,6 +187,7 @@ static int net_hwtstamp_validate(struct ifreq *ifr) case HWTSTAMP_TX_OFF: case HWTSTAMP_TX_ON: case HWTSTAMP_TX_ONESTEP_SYNC: + case HWTSTAMP_TX_ONESTEP_P2P: tx_type_valid = 1; break; } diff --git a/net/core/devlink.c b/net/core/devlink.c index 4f40aeace902..549ee56b7a21 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -18,6 +18,8 @@ #include <linux/spinlock.h> #include <linux/refcount.h> #include <linux/workqueue.h> +#include <linux/u64_stats_sync.h> +#include <linux/timekeeping.h> #include <rdma/ib_verbs.h> #include <net/netlink.h> #include <net/genetlink.h> @@ -25,6 +27,7 @@ #include <net/net_namespace.h> #include <net/sock.h> #include <net/devlink.h> +#include <net/drop_monitor.h> #define CREATE_TRACE_POINTS #include <trace/events/devlink.h> @@ -92,16 +95,25 @@ static LIST_HEAD(devlink_list); */ static DEFINE_MUTEX(devlink_mutex); -static struct net *devlink_net(const struct devlink *devlink) +struct net *devlink_net(const struct devlink *devlink) { return read_pnet(&devlink->_net); } +EXPORT_SYMBOL_GPL(devlink_net); -static void devlink_net_set(struct devlink *devlink, struct net *net) +static void __devlink_net_set(struct devlink *devlink, struct net *net) { write_pnet(&devlink->_net, net); } +void devlink_net_set(struct devlink *devlink, struct net *net) +{ + if (WARN_ON(devlink->registered)) + return; + __devlink_net_set(devlink, net); +} +EXPORT_SYMBOL_GPL(devlink_net_set); + static struct devlink *devlink_get_from_attrs(struct net *net, struct nlattr **attrs) { @@ -133,7 +145,7 @@ static struct devlink *devlink_get_from_info(struct genl_info *info) } static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, - int port_index) + unsigned int port_index) { struct devlink_port *devlink_port; @@ -144,7 +156,8 @@ static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, return NULL; } -static bool devlink_port_index_exists(struct devlink *devlink, int port_index) +static bool devlink_port_index_exists(struct devlink *devlink, + unsigned int port_index) { return devlink_port_get_by_index(devlink, port_index); } @@ -342,7 +355,6 @@ struct devlink_snapshot { struct list_head list; struct devlink_region *region; devlink_snapshot_data_dest_t *data_destructor; - u64 data_len; u8 *data; u32 id; }; @@ -371,14 +383,6 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) return NULL; } -static void devlink_region_snapshot_del(struct devlink_snapshot *snapshot) -{ - snapshot->region->cur_snapshots--; - list_del(&snapshot->list); - (*snapshot->data_destructor)(snapshot->data); - kfree(snapshot); -} - #define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0) #define DEVLINK_NL_FLAG_NEED_PORT BIT(1) #define DEVLINK_NL_FLAG_NEED_SB BIT(2) @@ -439,8 +443,16 @@ static void devlink_nl_post_doit(const struct genl_ops *ops, { struct devlink *devlink; - devlink = devlink_get_from_info(info); - if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + /* When devlink changes netns, it would not be found + * by devlink_get_from_info(). So try if it is stored first. + */ + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) { + devlink = info->user_ptr[0]; + } else { + devlink = devlink_get_from_info(info); + WARN_ON(IS_ERR(devlink)); + } + if (!IS_ERR(devlink) && ~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) mutex_unlock(&devlink->lock); mutex_unlock(&devlink_mutex); } @@ -476,6 +488,8 @@ static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed)) + goto nla_put_failure; genlmsg_end(msg, hdr); return 0; @@ -515,32 +529,37 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, return 0; if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) return -EMSGSIZE; - if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_PF) { + switch (devlink_port->attrs.flavour) { + case DEVLINK_PORT_FLAVOUR_PCI_PF: if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf)) return -EMSGSIZE; - } else if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_VF) { + break; + case DEVLINK_PORT_FLAVOUR_PCI_VF: if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf)) return -EMSGSIZE; + break; + case DEVLINK_PORT_FLAVOUR_PHYSICAL: + case DEVLINK_PORT_FLAVOUR_CPU: + case DEVLINK_PORT_FLAVOUR_DSA: + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, + attrs->phys.port_number)) + return -EMSGSIZE; + if (!attrs->split) + return 0; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, + attrs->phys.port_number)) + return -EMSGSIZE; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, + attrs->phys.split_subport_number)) + return -EMSGSIZE; + break; + default: + break; } - if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PHYSICAL && - devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && - devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA) - return 0; - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, - attrs->phys.port_number)) - return -EMSGSIZE; - if (!attrs->split) - return 0; - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, - attrs->phys.port_number)) - return -EMSGSIZE; - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, - attrs->phys.split_subport_number)) - return -EMSGSIZE; return 0; } @@ -560,7 +579,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) goto nla_put_failure; - spin_lock(&devlink_port->type_lock); + spin_lock_bh(&devlink_port->type_lock); if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) goto nla_put_failure_type_locked; if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET && @@ -585,7 +604,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, ibdev->name)) goto nla_put_failure_type_locked; } - spin_unlock(&devlink_port->type_lock); + spin_unlock_bh(&devlink_port->type_lock); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; @@ -593,7 +612,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, return 0; nla_put_failure_type_locked: - spin_unlock(&devlink_port->type_lock); + spin_unlock_bh(&devlink_port->type_lock); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; @@ -1033,7 +1052,7 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, struct devlink_sb *devlink_sb; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -1056,6 +1075,9 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -1231,7 +1253,7 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, struct devlink_sb *devlink_sb; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -1254,6 +1276,9 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -1458,7 +1483,7 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, struct devlink_sb *devlink_sb; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -1483,6 +1508,9 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -2672,12 +2700,119 @@ devlink_resources_validate(struct devlink *devlink, return err; } +static struct net *devlink_netns_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *netns_pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID]; + struct nlattr *netns_fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD]; + struct nlattr *netns_id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID]; + struct net *net; + + if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) { + NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified"); + return ERR_PTR(-EINVAL); + } + + if (netns_pid_attr) { + net = get_net_ns_by_pid(nla_get_u32(netns_pid_attr)); + } else if (netns_fd_attr) { + net = get_net_ns_by_fd(nla_get_u32(netns_fd_attr)); + } else if (netns_id_attr) { + net = get_net_ns_by_id(sock_net(skb->sk), + nla_get_u32(netns_id_attr)); + if (!net) + net = ERR_PTR(-EINVAL); + } else { + WARN_ON(1); + net = ERR_PTR(-EINVAL); + } + if (IS_ERR(net)) { + NL_SET_ERR_MSG(info->extack, "Unknown network namespace"); + return ERR_PTR(-EINVAL); + } + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { + put_net(net); + return ERR_PTR(-EPERM); + } + return net; +} + +static void devlink_param_notify(struct devlink *devlink, + unsigned int port_index, + struct devlink_param_item *param_item, + enum devlink_command cmd); + +static void devlink_reload_netns_change(struct devlink *devlink, + struct net *dest_net) +{ + struct devlink_param_item *param_item; + + /* Userspace needs to be notified about devlink objects + * removed from original and entering new network namespace. + * The rest of the devlink objects are re-created during + * reload process so the notifications are generated separatelly. + */ + + list_for_each_entry(param_item, &devlink->param_list, list) + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_DEL); + devlink_notify(devlink, DEVLINK_CMD_DEL); + + __devlink_net_set(devlink, dest_net); + + devlink_notify(devlink, DEVLINK_CMD_NEW); + list_for_each_entry(param_item, &devlink->param_list, list) + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_NEW); +} + +static bool devlink_reload_supported(struct devlink *devlink) +{ + return devlink->ops->reload_down && devlink->ops->reload_up; +} + +static void devlink_reload_failed_set(struct devlink *devlink, + bool reload_failed) +{ + if (devlink->reload_failed == reload_failed) + return; + devlink->reload_failed = reload_failed; + devlink_notify(devlink, DEVLINK_CMD_NEW); +} + +bool devlink_is_reload_failed(const struct devlink *devlink) +{ + return devlink->reload_failed; +} +EXPORT_SYMBOL_GPL(devlink_is_reload_failed); + +static int devlink_reload(struct devlink *devlink, struct net *dest_net, + struct netlink_ext_ack *extack) +{ + int err; + + if (!devlink->reload_enabled) + return -EOPNOTSUPP; + + err = devlink->ops->reload_down(devlink, !!dest_net, extack); + if (err) + return err; + + if (dest_net && !net_eq(dest_net, devlink_net(devlink))) + devlink_reload_netns_change(devlink, dest_net); + + err = devlink->ops->reload_up(devlink, extack); + devlink_reload_failed_set(devlink, !!err); + return err; +} + static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; + struct net *dest_net = NULL; int err; - if (!devlink->ops->reload) + if (!devlink_reload_supported(devlink) || !devlink->reload_enabled) return -EOPNOTSUPP; err = devlink_resources_validate(devlink, NULL, info); @@ -2685,7 +2820,21 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed"); return err; } - return devlink->ops->reload(devlink, info->extack); + + if (info->attrs[DEVLINK_ATTR_NETNS_PID] || + info->attrs[DEVLINK_ATTR_NETNS_FD] || + info->attrs[DEVLINK_ATTR_NETNS_ID]) { + dest_net = devlink_netns_get(skb, info); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); + } + + err = devlink_reload(devlink, dest_net, info->extack); + + if (dest_net) + put_net(dest_net); + + return err; } static int devlink_nl_flash_update_fill(struct sk_buff *msg, @@ -2852,6 +3001,16 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME, .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE, + .name = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME, + .type = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, + .name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) @@ -3123,7 +3282,7 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, struct devlink *devlink; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -3140,7 +3299,7 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) { + if (err && err != -EOPNOTSUPP) { mutex_unlock(&devlink->lock); goto out; } @@ -3151,6 +3310,9 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -3379,7 +3541,7 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, struct devlink *devlink; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -3400,7 +3562,7 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) { + if (err && err != -EOPNOTSUPP) { mutex_unlock(&devlink->lock); goto out; } @@ -3412,6 +3574,9 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -3596,6 +3761,16 @@ out_free_msg: nlmsg_free(msg); } +static void devlink_region_snapshot_del(struct devlink_region *region, + struct devlink_snapshot *snapshot) +{ + devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL); + region->cur_snapshots--; + list_del(&snapshot->list); + (*snapshot->data_destructor)(snapshot->data); + kfree(snapshot); +} + static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb, struct genl_info *info) { @@ -3691,8 +3866,7 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb, if (!snapshot) return -EINVAL; - devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL); - devlink_region_snapshot_del(snapshot); + devlink_region_snapshot_del(region, snapshot); return 0; } @@ -3748,8 +3922,8 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, if (!snapshot) return -EINVAL; - if (end_offset > snapshot->data_len || dump) - end_offset = snapshot->data_len; + if (end_offset > region->size || dump) + end_offset = region->size; while (curr_offset < end_offset) { u32 data_size; @@ -3777,29 +3951,19 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { + const struct genl_dumpit_info *info = genl_dumpit_info(cb); u64 ret_offset, start_offset, end_offset = 0; + struct nlattr **attrs = info->attrs; struct devlink_region *region; struct nlattr *chunks_attr; const char *region_name; struct devlink *devlink; - struct nlattr **attrs; bool dump = true; void *hdr; int err; start_offset = *((u64 *)&cb->args[0]); - attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL); - if (!attrs) - return -ENOMEM; - - err = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + devlink_nl_family.hdrsize, - attrs, DEVLINK_ATTR_MAX, - devlink_nl_family.policy, cb->extack); - if (err) - goto out_free; - mutex_lock(&devlink_mutex); devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); if (IS_ERR(devlink)) { @@ -3822,6 +3986,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto out_unlock; } + /* return 0 if there is no further data to read */ + if (start_offset >= region->size) { + err = 0; + goto out_unlock; + } + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, DEVLINK_CMD_REGION_READ); @@ -3876,7 +4046,6 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, genlmsg_end(skb, hdr); mutex_unlock(&devlink->lock); mutex_unlock(&devlink_mutex); - kfree(attrs); return skb->len; @@ -3886,8 +4055,6 @@ out_unlock: mutex_unlock(&devlink->lock); out_dev: mutex_unlock(&devlink_mutex); -out_free: - kfree(attrs); return err; } @@ -4025,7 +4192,7 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, struct devlink *devlink; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -4047,12 +4214,15 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->extack); mutex_unlock(&devlink->lock); - if (err) + if (err && err != -EOPNOTSUPP) break; idx++; } mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -4255,12 +4425,11 @@ int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) } EXPORT_SYMBOL_GPL(devlink_fmsg_string_put); -int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, - u16 value_len) +static int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, + u16 value_len) { return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY); } -EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put); int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, bool value) @@ -4368,19 +4537,26 @@ int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put); int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, - const void *value, u16 value_len) + const void *value, u32 value_len) { + u32 data_size; + u32 offset; int err; - err = devlink_fmsg_pair_nest_start(fmsg, name); + err = devlink_fmsg_arr_pair_nest_start(fmsg, name); if (err) return err; - err = devlink_fmsg_binary_put(fmsg, value, value_len); - if (err) - return err; + for (offset = 0; offset < value_len; offset += data_size) { + data_size = value_len - offset; + if (data_size > DEVLINK_FMSG_MAX_SIZE) + data_size = DEVLINK_FMSG_MAX_SIZE; + err = devlink_fmsg_binary_put(fmsg, value + offset, data_size); + if (err) + return err; + } - err = devlink_fmsg_pair_nest_end(fmsg); + err = devlink_fmsg_arr_pair_nest_end(fmsg); if (err) return err; @@ -4577,6 +4753,7 @@ struct devlink_health_reporter { bool auto_recover; u8 health_state; u64 dump_ts; + u64 dump_real_ts; u64 error_count; u64 recovery_count; u64 last_recovery_ts; @@ -4672,39 +4849,120 @@ devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) } EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); -void -devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, - enum devlink_health_reporter_state state) +static int +devlink_nl_health_reporter_fill(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_health_reporter *reporter, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) { - if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY && - state != DEVLINK_HEALTH_REPORTER_STATE_ERROR)) + struct nlattr *reporter_attr; + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto genlmsg_cancel; + + reporter_attr = nla_nest_start_noflag(msg, + DEVLINK_ATTR_HEALTH_REPORTER); + if (!reporter_attr) + goto genlmsg_cancel; + if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME, + reporter->ops->name)) + goto reporter_nest_cancel; + if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, + reporter->health_state)) + goto reporter_nest_cancel; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT, + reporter->error_count, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT, + reporter->recovery_count, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (reporter->ops->recover && + nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, + reporter->graceful_period, + DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (reporter->ops->recover && + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, + reporter->auto_recover)) + goto reporter_nest_cancel; + if (reporter->dump_fmsg && + nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, + jiffies_to_msecs(reporter->dump_ts), + DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (reporter->dump_fmsg && + nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, + reporter->dump_real_ts, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + + nla_nest_end(msg, reporter_attr); + genlmsg_end(msg, hdr); + return 0; + +reporter_nest_cancel: + nla_nest_end(msg, reporter_attr); +genlmsg_cancel: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_recover_notify(struct devlink_health_reporter *reporter, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) return; - if (reporter->health_state == state) + err = devlink_nl_health_reporter_fill(msg, reporter->devlink, + reporter, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); return; + } - reporter->health_state = state; - trace_devlink_health_reporter_state_update(reporter->devlink, - reporter->ops->name, state); + genlmsg_multicast_netns(&devlink_nl_family, + devlink_net(reporter->devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); + +void +devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter) +{ + reporter->recovery_count++; + reporter->last_recovery_ts = jiffies; +} +EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done); static int devlink_health_reporter_recover(struct devlink_health_reporter *reporter, - void *priv_ctx) + void *priv_ctx, struct netlink_ext_ack *extack) { int err; + if (reporter->health_state == DEVLINK_HEALTH_REPORTER_STATE_HEALTHY) + return 0; + if (!reporter->ops->recover) return -EOPNOTSUPP; - err = reporter->ops->recover(reporter, priv_ctx); + err = reporter->ops->recover(reporter, priv_ctx, extack); if (err) return err; - reporter->recovery_count++; + devlink_health_reporter_recovery_done(reporter); reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY; - reporter->last_recovery_ts = jiffies; + devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); return 0; } @@ -4719,7 +4977,8 @@ devlink_health_dump_clear(struct devlink_health_reporter *reporter) } static int devlink_health_do_dump(struct devlink_health_reporter *reporter, - void *priv_ctx) + void *priv_ctx, + struct netlink_ext_ack *extack) { int err; @@ -4740,7 +4999,7 @@ static int devlink_health_do_dump(struct devlink_health_reporter *reporter, goto dump_err; err = reporter->ops->dump(reporter, reporter->dump_fmsg, - priv_ctx); + priv_ctx, extack); if (err) goto dump_err; @@ -4749,6 +5008,7 @@ static int devlink_health_do_dump(struct devlink_health_reporter *reporter, goto dump_err; reporter->dump_ts = jiffies; + reporter->dump_real_ts = ktime_get_real_ns(); return 0; @@ -4769,6 +5029,7 @@ int devlink_health_report(struct devlink_health_reporter *reporter, reporter->error_count++; prev_health_state = reporter->health_state; reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; + devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); /* abort if the previous error wasn't recovered */ if (reporter->auto_recover && @@ -4787,11 +5048,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter, mutex_lock(&reporter->dump_lock); /* store current dump of current error, for later analysis */ - devlink_health_do_dump(reporter, priv_ctx); + devlink_health_do_dump(reporter, priv_ctx, NULL); mutex_unlock(&reporter->dump_lock); if (reporter->auto_recover) - return devlink_health_reporter_recover(reporter, priv_ctx); + return devlink_health_reporter_recover(reporter, + priv_ctx, NULL); return 0; } @@ -4826,21 +5088,10 @@ devlink_health_reporter_get_from_info(struct devlink *devlink, static struct devlink_health_reporter * devlink_health_reporter_get_from_cb(struct netlink_callback *cb) { + const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct devlink_health_reporter *reporter; + struct nlattr **attrs = info->attrs; struct devlink *devlink; - struct nlattr **attrs; - int err; - - attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL); - if (!attrs) - return NULL; - - err = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + devlink_nl_family.hdrsize, - attrs, DEVLINK_ATTR_MAX, - devlink_nl_family.policy, cb->extack); - if (err) - goto free; mutex_lock(&devlink_mutex); devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); @@ -4849,12 +5100,9 @@ devlink_health_reporter_get_from_cb(struct netlink_callback *cb) reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); mutex_unlock(&devlink_mutex); - kfree(attrs); return reporter; unlock: mutex_unlock(&devlink_mutex); -free: - kfree(attrs); return NULL; } @@ -4864,64 +5112,23 @@ devlink_health_reporter_put(struct devlink_health_reporter *reporter) refcount_dec(&reporter->refcount); } -static int -devlink_nl_health_reporter_fill(struct sk_buff *msg, - struct devlink *devlink, - struct devlink_health_reporter *reporter, - enum devlink_command cmd, u32 portid, - u32 seq, int flags) +void +devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, + enum devlink_health_reporter_state state) { - struct nlattr *reporter_attr; - void *hdr; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto genlmsg_cancel; - - reporter_attr = nla_nest_start_noflag(msg, - DEVLINK_ATTR_HEALTH_REPORTER); - if (!reporter_attr) - goto genlmsg_cancel; - if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME, - reporter->ops->name)) - goto reporter_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, - reporter->health_state)) - goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT, - reporter->error_count, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT, - reporter->recovery_count, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (reporter->ops->recover && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, - reporter->graceful_period, - DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (reporter->ops->recover && - nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, - reporter->auto_recover)) - goto reporter_nest_cancel; - if (reporter->dump_fmsg && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, - jiffies_to_msecs(reporter->dump_ts), - DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; + if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY && + state != DEVLINK_HEALTH_REPORTER_STATE_ERROR)) + return; - nla_nest_end(msg, reporter_attr); - genlmsg_end(msg, hdr); - return 0; + if (reporter->health_state == state) + return; -reporter_nest_cancel: - nla_nest_end(msg, reporter_attr); -genlmsg_cancel: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; + reporter->health_state = state; + trace_devlink_health_reporter_state_update(reporter->devlink, + reporter->ops->name, state); + devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); } +EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, struct genl_info *info) @@ -5043,7 +5250,7 @@ static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, if (!reporter) return -EINVAL; - err = devlink_health_reporter_recover(reporter, NULL); + err = devlink_health_reporter_recover(reporter, NULL, info->extack); devlink_health_reporter_put(reporter); return err; @@ -5076,7 +5283,7 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, if (err) goto out; - err = reporter->ops->diagnose(reporter, fmsg); + err = reporter->ops->diagnose(reporter, fmsg, info->extack); if (err) goto out; @@ -5111,7 +5318,7 @@ devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, } mutex_lock(&reporter->dump_lock); if (!start) { - err = devlink_health_do_dump(reporter, NULL); + err = devlink_health_do_dump(reporter, NULL, cb->extack); if (err) goto unlock; cb->args[1] = reporter->dump_ts; @@ -5154,6 +5361,571 @@ devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, return 0; } +struct devlink_stats { + u64 rx_bytes; + u64 rx_packets; + struct u64_stats_sync syncp; +}; + +/** + * struct devlink_trap_group_item - Packet trap group attributes. + * @group: Immutable packet trap group attributes. + * @refcount: Number of trap items using the group. + * @list: trap_group_list member. + * @stats: Trap group statistics. + * + * Describes packet trap group attributes. Created by devlink during trap + * registration. + */ +struct devlink_trap_group_item { + const struct devlink_trap_group *group; + refcount_t refcount; + struct list_head list; + struct devlink_stats __percpu *stats; +}; + +/** + * struct devlink_trap_item - Packet trap attributes. + * @trap: Immutable packet trap attributes. + * @group_item: Associated group item. + * @list: trap_list member. + * @action: Trap action. + * @stats: Trap statistics. + * @priv: Driver private information. + * + * Describes both mutable and immutable packet trap attributes. Created by + * devlink during trap registration and used for all trap related operations. + */ +struct devlink_trap_item { + const struct devlink_trap *trap; + struct devlink_trap_group_item *group_item; + struct list_head list; + enum devlink_trap_action action; + struct devlink_stats __percpu *stats; + void *priv; +}; + +static struct devlink_trap_item * +devlink_trap_item_lookup(struct devlink *devlink, const char *name) +{ + struct devlink_trap_item *trap_item; + + list_for_each_entry(trap_item, &devlink->trap_list, list) { + if (!strcmp(trap_item->trap->name, name)) + return trap_item; + } + + return NULL; +} + +static struct devlink_trap_item * +devlink_trap_item_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + struct nlattr *attr; + + if (!info->attrs[DEVLINK_ATTR_TRAP_NAME]) + return NULL; + attr = info->attrs[DEVLINK_ATTR_TRAP_NAME]; + + return devlink_trap_item_lookup(devlink, nla_data(attr)); +} + +static int +devlink_trap_action_get_from_info(struct genl_info *info, + enum devlink_trap_action *p_trap_action) +{ + u8 val; + + val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]); + switch (val) { + case DEVLINK_TRAP_ACTION_DROP: /* fall-through */ + case DEVLINK_TRAP_ACTION_TRAP: + *p_trap_action = val; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int devlink_trap_metadata_put(struct sk_buff *msg, + const struct devlink_trap *trap) +{ + struct nlattr *attr; + + attr = nla_nest_start(msg, DEVLINK_ATTR_TRAP_METADATA); + if (!attr) + return -EMSGSIZE; + + if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) && + nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, + struct devlink_stats *stats) +{ + int i; + + memset(stats, 0, sizeof(*stats)); + for_each_possible_cpu(i) { + struct devlink_stats *cpu_stats; + u64 rx_packets, rx_bytes; + unsigned int start; + + cpu_stats = per_cpu_ptr(trap_stats, i); + do { + start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + rx_packets = cpu_stats->rx_packets; + rx_bytes = cpu_stats->rx_bytes; + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + + stats->rx_packets += rx_packets; + stats->rx_bytes += rx_bytes; + } +} + +static int devlink_trap_stats_put(struct sk_buff *msg, + struct devlink_stats __percpu *trap_stats) +{ + struct devlink_stats stats; + struct nlattr *attr; + + devlink_trap_stats_read(trap_stats, &stats); + + attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); + if (!attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS, + stats.rx_packets, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES, + stats.rx_bytes, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink, + const struct devlink_trap_item *trap_item, + enum devlink_command cmd, u32 portid, u32 seq, + int flags) +{ + struct devlink_trap_group_item *group_item = trap_item->group_item; + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME, + group_item->group->name)) + goto nla_put_failure; + + if (nla_put_string(msg, DEVLINK_ATTR_TRAP_NAME, trap_item->trap->name)) + goto nla_put_failure; + + if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_TYPE, trap_item->trap->type)) + goto nla_put_failure; + + if (trap_item->trap->generic && + nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) + goto nla_put_failure; + + if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_ACTION, trap_item->action)) + goto nla_put_failure; + + err = devlink_trap_metadata_put(msg, trap_item->trap); + if (err) + goto nla_put_failure; + + err = devlink_trap_stats_put(msg, trap_item->stats); + if (err) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_trap_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_trap_item *trap_item; + struct sk_buff *msg; + int err; + + if (list_empty(&devlink->trap_list)) + return -EOPNOTSUPP; + + trap_item = devlink_trap_item_get_from_info(devlink, info); + if (!trap_item) { + NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap"); + return -ENOENT; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_trap_fill(msg, devlink, trap_item, + DEVLINK_CMD_TRAP_NEW, info->snd_portid, + info->snd_seq, 0); + if (err) + goto err_trap_fill; + + return genlmsg_reply(msg, info); + +err_trap_fill: + nlmsg_free(msg); + return err; +} + +static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_trap_item *trap_item; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + mutex_lock(&devlink->lock); + list_for_each_entry(trap_item, &devlink->trap_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_trap_fill(msg, devlink, trap_item, + DEVLINK_CMD_TRAP_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int __devlink_trap_action_set(struct devlink *devlink, + struct devlink_trap_item *trap_item, + enum devlink_trap_action trap_action, + struct netlink_ext_ack *extack) +{ + int err; + + if (trap_item->action != trap_action && + trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) { + NL_SET_ERR_MSG_MOD(extack, "Cannot change action of non-drop traps. Skipping"); + return 0; + } + + err = devlink->ops->trap_action_set(devlink, trap_item->trap, + trap_action); + if (err) + return err; + + trap_item->action = trap_action; + + return 0; +} + +static int devlink_trap_action_set(struct devlink *devlink, + struct devlink_trap_item *trap_item, + struct genl_info *info) +{ + enum devlink_trap_action trap_action; + int err; + + if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION]) + return 0; + + err = devlink_trap_action_get_from_info(info, &trap_action); + if (err) { + NL_SET_ERR_MSG_MOD(info->extack, "Invalid trap action"); + return -EINVAL; + } + + return __devlink_trap_action_set(devlink, trap_item, trap_action, + info->extack); +} + +static int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_trap_item *trap_item; + int err; + + if (list_empty(&devlink->trap_list)) + return -EOPNOTSUPP; + + trap_item = devlink_trap_item_get_from_info(devlink, info); + if (!trap_item) { + NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap"); + return -ENOENT; + } + + err = devlink_trap_action_set(devlink, trap_item, info); + if (err) + return err; + + return 0; +} + +static struct devlink_trap_group_item * +devlink_trap_group_item_lookup(struct devlink *devlink, const char *name) +{ + struct devlink_trap_group_item *group_item; + + list_for_each_entry(group_item, &devlink->trap_group_list, list) { + if (!strcmp(group_item->group->name, name)) + return group_item; + } + + return NULL; +} + +static struct devlink_trap_group_item * +devlink_trap_group_item_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + char *name; + + if (!info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]) + return NULL; + name = nla_data(info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]); + + return devlink_trap_group_item_lookup(devlink, name); +} + +static int +devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink, + const struct devlink_trap_group_item *group_item, + enum devlink_command cmd, u32 portid, u32 seq, + int flags) +{ + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME, + group_item->group->name)) + goto nla_put_failure; + + if (group_item->group->generic && + nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) + goto nla_put_failure; + + err = devlink_trap_stats_put(msg, group_item->stats); + if (err) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_trap_group_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_trap_group_item *group_item; + struct sk_buff *msg; + int err; + + if (list_empty(&devlink->trap_group_list)) + return -EOPNOTSUPP; + + group_item = devlink_trap_group_item_get_from_info(devlink, info); + if (!group_item) { + NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap group"); + return -ENOENT; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_trap_group_fill(msg, devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) + goto err_trap_group_fill; + + return genlmsg_reply(msg, info); + +err_trap_group_fill: + nlmsg_free(msg); + return err; +} + +static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + enum devlink_command cmd = DEVLINK_CMD_TRAP_GROUP_NEW; + struct devlink_trap_group_item *group_item; + u32 portid = NETLINK_CB(cb->skb).portid; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + mutex_lock(&devlink->lock); + list_for_each_entry(group_item, &devlink->trap_group_list, + list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_trap_group_fill(msg, devlink, + group_item, cmd, + portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int +__devlink_trap_group_action_set(struct devlink *devlink, + struct devlink_trap_group_item *group_item, + enum devlink_trap_action trap_action, + struct netlink_ext_ack *extack) +{ + const char *group_name = group_item->group->name; + struct devlink_trap_item *trap_item; + int err; + + list_for_each_entry(trap_item, &devlink->trap_list, list) { + if (strcmp(trap_item->trap->group.name, group_name)) + continue; + err = __devlink_trap_action_set(devlink, trap_item, + trap_action, extack); + if (err) + return err; + } + + return 0; +} + +static int +devlink_trap_group_action_set(struct devlink *devlink, + struct devlink_trap_group_item *group_item, + struct genl_info *info) +{ + enum devlink_trap_action trap_action; + int err; + + if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION]) + return 0; + + err = devlink_trap_action_get_from_info(info, &trap_action); + if (err) { + NL_SET_ERR_MSG_MOD(info->extack, "Invalid trap action"); + return -EINVAL; + } + + err = __devlink_trap_group_action_set(devlink, group_item, trap_action, + info->extack); + if (err) + return err; + + return 0; +} + +static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_trap_group_item *group_item; + int err; + + if (list_empty(&devlink->trap_group_list)) + return -EOPNOTSUPP; + + group_item = devlink_trap_group_item_get_from_info(devlink, info); + if (!group_item) { + NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap group"); + return -ENOENT; + } + + err = devlink_trap_group_action_set(devlink, group_item, info); + if (err) + return err; + + return 0; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -5184,6 +5956,12 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 }, + [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 }, + [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -5414,7 +6192,8 @@ static const struct genl_ops devlink_nl_ops[] = { }, { .cmd = DEVLINK_CMD_REGION_READ, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = devlink_nl_cmd_region_read_dumpit, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, @@ -5462,7 +6241,8 @@ static const struct genl_ops devlink_nl_ops[] = { }, { .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | @@ -5483,6 +6263,32 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_TRAP_GET, + .doit = devlink_nl_cmd_trap_get_doit, + .dumpit = devlink_nl_cmd_trap_get_dumpit, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_TRAP_SET, + .doit = devlink_nl_cmd_trap_set_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_TRAP_GROUP_GET, + .doit = devlink_nl_cmd_trap_group_get_doit, + .dumpit = devlink_nl_cmd_trap_group_get_dumpit, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_TRAP_GROUP_SET, + .doit = devlink_nl_cmd_trap_group_set_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { @@ -5520,7 +6326,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) if (!devlink) return NULL; devlink->ops = ops; - devlink_net_set(devlink, &init_net); + __devlink_net_set(devlink, &init_net); INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); @@ -5528,6 +6334,8 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->param_list); INIT_LIST_HEAD(&devlink->region_list); INIT_LIST_HEAD(&devlink->reporter_list); + INIT_LIST_HEAD(&devlink->trap_list); + INIT_LIST_HEAD(&devlink->trap_group_list); mutex_init(&devlink->lock); mutex_init(&devlink->reporters_lock); return devlink; @@ -5544,6 +6352,7 @@ int devlink_register(struct devlink *devlink, struct device *dev) { mutex_lock(&devlink_mutex); devlink->dev = dev; + devlink->registered = true; list_add_tail(&devlink->list, &devlink_list); devlink_notify(devlink, DEVLINK_CMD_NEW); mutex_unlock(&devlink_mutex); @@ -5559,6 +6368,8 @@ EXPORT_SYMBOL_GPL(devlink_register); void devlink_unregister(struct devlink *devlink) { mutex_lock(&devlink_mutex); + WARN_ON(devlink_reload_supported(devlink) && + devlink->reload_enabled); devlink_notify(devlink, DEVLINK_CMD_DEL); list_del(&devlink->list); mutex_unlock(&devlink_mutex); @@ -5566,6 +6377,41 @@ void devlink_unregister(struct devlink *devlink) EXPORT_SYMBOL_GPL(devlink_unregister); /** + * devlink_reload_enable - Enable reload of devlink instance + * + * @devlink: devlink + * + * Should be called at end of device initialization + * process when reload operation is supported. + */ +void devlink_reload_enable(struct devlink *devlink) +{ + mutex_lock(&devlink_mutex); + devlink->reload_enabled = true; + mutex_unlock(&devlink_mutex); +} +EXPORT_SYMBOL_GPL(devlink_reload_enable); + +/** + * devlink_reload_disable - Disable reload of devlink instance + * + * @devlink: devlink + * + * Should be called at the beginning of device cleanup + * process when reload operation is supported. + */ +void devlink_reload_disable(struct devlink *devlink) +{ + mutex_lock(&devlink_mutex); + /* Mutex is taken which ensures that no reload operation is in + * progress while setting up forbidded flag. + */ + devlink->reload_enabled = false; + mutex_unlock(&devlink_mutex); +} +EXPORT_SYMBOL_GPL(devlink_reload_disable); + +/** * devlink_free - Free devlink instance resources * * @devlink: devlink @@ -5574,6 +6420,8 @@ void devlink_free(struct devlink *devlink) { mutex_destroy(&devlink->reporters_lock); mutex_destroy(&devlink->lock); + WARN_ON(!list_empty(&devlink->trap_group_list)); + WARN_ON(!list_empty(&devlink->trap_list)); WARN_ON(!list_empty(&devlink->reporter_list)); WARN_ON(!list_empty(&devlink->region_list)); WARN_ON(!list_empty(&devlink->param_list)); @@ -5598,7 +6446,7 @@ static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA; } -#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 30) +#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600) static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port) { @@ -5678,10 +6526,10 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, if (WARN_ON(!devlink_port->registered)) return; devlink_port_type_warn_cancel(devlink_port); - spin_lock(&devlink_port->type_lock); + spin_lock_bh(&devlink_port->type_lock); devlink_port->type = type; devlink_port->type_dev = type_dev; - spin_unlock(&devlink_port->type_lock); + spin_unlock_bh(&devlink_port->type_lock); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } @@ -6538,7 +7386,7 @@ __devlink_param_driverinit_value_set(struct devlink *devlink, int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, union devlink_param_value *init_val) { - if (!devlink->ops->reload) + if (!devlink_reload_supported(devlink)) return -EOPNOTSUPP; return __devlink_param_driverinit_value_get(&devlink->param_list, @@ -6585,7 +7433,7 @@ int devlink_port_param_driverinit_value_get(struct devlink_port *devlink_port, { struct devlink *devlink = devlink_port->devlink; - if (!devlink->ops->reload) + if (!devlink_reload_supported(devlink)) return -EOPNOTSUPP; return __devlink_param_driverinit_value_get(&devlink_port->param_list, @@ -6744,7 +7592,7 @@ void devlink_region_destroy(struct devlink_region *region) /* Free all snapshots of region */ list_for_each_entry_safe(snapshot, ts, ®ion->snapshot_list, list) - devlink_region_snapshot_del(snapshot); + devlink_region_snapshot_del(region, snapshot); list_del(®ion->list); @@ -6755,7 +7603,7 @@ void devlink_region_destroy(struct devlink_region *region) EXPORT_SYMBOL_GPL(devlink_region_destroy); /** - * devlink_region_shapshot_id_get - get snapshot ID + * devlink_region_snapshot_id_get - get snapshot ID * * This callback should be called when adding a new snapshot, * Driver should use the same id for multiple snapshots taken @@ -6763,7 +7611,7 @@ EXPORT_SYMBOL_GPL(devlink_region_destroy); * * @devlink: devlink */ -u32 devlink_region_shapshot_id_get(struct devlink *devlink) +u32 devlink_region_snapshot_id_get(struct devlink *devlink) { u32 id; @@ -6773,7 +7621,7 @@ u32 devlink_region_shapshot_id_get(struct devlink *devlink) return id; } -EXPORT_SYMBOL_GPL(devlink_region_shapshot_id_get); +EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get); /** * devlink_region_snapshot_create - create a new snapshot @@ -6784,12 +7632,11 @@ EXPORT_SYMBOL_GPL(devlink_region_shapshot_id_get); * The @snapshot_id should be obtained using the getter function. * * @region: devlink region of the snapshot - * @data_len: size of snapshot data * @data: snapshot data * @snapshot_id: snapshot id to be created * @data_destructor: pointer to destructor function to free data */ -int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, +int devlink_region_snapshot_create(struct devlink_region *region, u8 *data, u32 snapshot_id, devlink_snapshot_data_dest_t *data_destructor) { @@ -6819,7 +7666,6 @@ int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, snapshot->id = snapshot_id; snapshot->region = region; snapshot->data = data; - snapshot->data_len = data_len; snapshot->data_destructor = data_destructor; list_add_tail(&snapshot->list, ®ion->snapshot_list); @@ -6836,6 +7682,494 @@ unlock: } EXPORT_SYMBOL_GPL(devlink_region_snapshot_create); +#define DEVLINK_TRAP(_id, _type) \ + { \ + .type = DEVLINK_TRAP_TYPE_##_type, \ + .id = DEVLINK_TRAP_GENERIC_ID_##_id, \ + .name = DEVLINK_TRAP_GENERIC_NAME_##_id, \ + } + +static const struct devlink_trap devlink_trap_generic[] = { + DEVLINK_TRAP(SMAC_MC, DROP), + DEVLINK_TRAP(VLAN_TAG_MISMATCH, DROP), + DEVLINK_TRAP(INGRESS_VLAN_FILTER, DROP), + DEVLINK_TRAP(INGRESS_STP_FILTER, DROP), + DEVLINK_TRAP(EMPTY_TX_LIST, DROP), + DEVLINK_TRAP(PORT_LOOPBACK_FILTER, DROP), + DEVLINK_TRAP(BLACKHOLE_ROUTE, DROP), + DEVLINK_TRAP(TTL_ERROR, EXCEPTION), + DEVLINK_TRAP(TAIL_DROP, DROP), + DEVLINK_TRAP(NON_IP_PACKET, DROP), + DEVLINK_TRAP(UC_DIP_MC_DMAC, DROP), + DEVLINK_TRAP(DIP_LB, DROP), + DEVLINK_TRAP(SIP_MC, DROP), + DEVLINK_TRAP(SIP_LB, DROP), + DEVLINK_TRAP(CORRUPTED_IP_HDR, DROP), + DEVLINK_TRAP(IPV4_SIP_BC, DROP), + DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP), + DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP), + DEVLINK_TRAP(MTU_ERROR, EXCEPTION), + DEVLINK_TRAP(UNRESOLVED_NEIGH, EXCEPTION), + DEVLINK_TRAP(RPF, EXCEPTION), + DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION), + DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION), + DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION), + DEVLINK_TRAP(NON_ROUTABLE, DROP), + DEVLINK_TRAP(DECAP_ERROR, EXCEPTION), + DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP), +}; + +#define DEVLINK_TRAP_GROUP(_id) \ + { \ + .id = DEVLINK_TRAP_GROUP_GENERIC_ID_##_id, \ + .name = DEVLINK_TRAP_GROUP_GENERIC_NAME_##_id, \ + } + +static const struct devlink_trap_group devlink_trap_group_generic[] = { + DEVLINK_TRAP_GROUP(L2_DROPS), + DEVLINK_TRAP_GROUP(L3_DROPS), + DEVLINK_TRAP_GROUP(BUFFER_DROPS), + DEVLINK_TRAP_GROUP(TUNNEL_DROPS), +}; + +static int devlink_trap_generic_verify(const struct devlink_trap *trap) +{ + if (trap->id > DEVLINK_TRAP_GENERIC_ID_MAX) + return -EINVAL; + + if (strcmp(trap->name, devlink_trap_generic[trap->id].name)) + return -EINVAL; + + if (trap->type != devlink_trap_generic[trap->id].type) + return -EINVAL; + + return 0; +} + +static int devlink_trap_driver_verify(const struct devlink_trap *trap) +{ + int i; + + if (trap->id <= DEVLINK_TRAP_GENERIC_ID_MAX) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(devlink_trap_generic); i++) { + if (!strcmp(trap->name, devlink_trap_generic[i].name)) + return -EEXIST; + } + + return 0; +} + +static int devlink_trap_verify(const struct devlink_trap *trap) +{ + if (!trap || !trap->name || !trap->group.name) + return -EINVAL; + + if (trap->generic) + return devlink_trap_generic_verify(trap); + else + return devlink_trap_driver_verify(trap); +} + +static int +devlink_trap_group_generic_verify(const struct devlink_trap_group *group) +{ + if (group->id > DEVLINK_TRAP_GROUP_GENERIC_ID_MAX) + return -EINVAL; + + if (strcmp(group->name, devlink_trap_group_generic[group->id].name)) + return -EINVAL; + + return 0; +} + +static int +devlink_trap_group_driver_verify(const struct devlink_trap_group *group) +{ + int i; + + if (group->id <= DEVLINK_TRAP_GROUP_GENERIC_ID_MAX) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(devlink_trap_group_generic); i++) { + if (!strcmp(group->name, devlink_trap_group_generic[i].name)) + return -EEXIST; + } + + return 0; +} + +static int devlink_trap_group_verify(const struct devlink_trap_group *group) +{ + if (group->generic) + return devlink_trap_group_generic_verify(group); + else + return devlink_trap_group_driver_verify(group); +} + +static void +devlink_trap_group_notify(struct devlink *devlink, + const struct devlink_trap_group_item *group_item, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_GROUP_NEW && + cmd != DEVLINK_CMD_TRAP_GROUP_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_trap_group_fill(msg, devlink, group_item, cmd, 0, 0, + 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static struct devlink_trap_group_item * +devlink_trap_group_item_create(struct devlink *devlink, + const struct devlink_trap_group *group) +{ + struct devlink_trap_group_item *group_item; + int err; + + err = devlink_trap_group_verify(group); + if (err) + return ERR_PTR(err); + + group_item = kzalloc(sizeof(*group_item), GFP_KERNEL); + if (!group_item) + return ERR_PTR(-ENOMEM); + + group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); + if (!group_item->stats) { + err = -ENOMEM; + goto err_stats_alloc; + } + + group_item->group = group; + refcount_set(&group_item->refcount, 1); + + if (devlink->ops->trap_group_init) { + err = devlink->ops->trap_group_init(devlink, group); + if (err) + goto err_group_init; + } + + list_add_tail(&group_item->list, &devlink->trap_group_list); + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_NEW); + + return group_item; + +err_group_init: + free_percpu(group_item->stats); +err_stats_alloc: + kfree(group_item); + return ERR_PTR(err); +} + +static void +devlink_trap_group_item_destroy(struct devlink *devlink, + struct devlink_trap_group_item *group_item) +{ + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_DEL); + list_del(&group_item->list); + free_percpu(group_item->stats); + kfree(group_item); +} + +static struct devlink_trap_group_item * +devlink_trap_group_item_get(struct devlink *devlink, + const struct devlink_trap_group *group) +{ + struct devlink_trap_group_item *group_item; + + group_item = devlink_trap_group_item_lookup(devlink, group->name); + if (group_item) { + refcount_inc(&group_item->refcount); + return group_item; + } + + return devlink_trap_group_item_create(devlink, group); +} + +static void +devlink_trap_group_item_put(struct devlink *devlink, + struct devlink_trap_group_item *group_item) +{ + if (!refcount_dec_and_test(&group_item->refcount)) + return; + + devlink_trap_group_item_destroy(devlink, group_item); +} + +static int +devlink_trap_item_group_link(struct devlink *devlink, + struct devlink_trap_item *trap_item) +{ + struct devlink_trap_group_item *group_item; + + group_item = devlink_trap_group_item_get(devlink, + &trap_item->trap->group); + if (IS_ERR(group_item)) + return PTR_ERR(group_item); + + trap_item->group_item = group_item; + + return 0; +} + +static void +devlink_trap_item_group_unlink(struct devlink *devlink, + struct devlink_trap_item *trap_item) +{ + devlink_trap_group_item_put(devlink, trap_item->group_item); +} + +static void devlink_trap_notify(struct devlink *devlink, + const struct devlink_trap_item *trap_item, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_NEW && + cmd != DEVLINK_CMD_TRAP_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_trap_fill(msg, devlink, trap_item, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int +devlink_trap_register(struct devlink *devlink, + const struct devlink_trap *trap, void *priv) +{ + struct devlink_trap_item *trap_item; + int err; + + if (devlink_trap_item_lookup(devlink, trap->name)) + return -EEXIST; + + trap_item = kzalloc(sizeof(*trap_item), GFP_KERNEL); + if (!trap_item) + return -ENOMEM; + + trap_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); + if (!trap_item->stats) { + err = -ENOMEM; + goto err_stats_alloc; + } + + trap_item->trap = trap; + trap_item->action = trap->init_action; + trap_item->priv = priv; + + err = devlink_trap_item_group_link(devlink, trap_item); + if (err) + goto err_group_link; + + err = devlink->ops->trap_init(devlink, trap, trap_item); + if (err) + goto err_trap_init; + + list_add_tail(&trap_item->list, &devlink->trap_list); + devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); + + return 0; + +err_trap_init: + devlink_trap_item_group_unlink(devlink, trap_item); +err_group_link: + free_percpu(trap_item->stats); +err_stats_alloc: + kfree(trap_item); + return err; +} + +static void devlink_trap_unregister(struct devlink *devlink, + const struct devlink_trap *trap) +{ + struct devlink_trap_item *trap_item; + + trap_item = devlink_trap_item_lookup(devlink, trap->name); + if (WARN_ON_ONCE(!trap_item)) + return; + + devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); + list_del(&trap_item->list); + if (devlink->ops->trap_fini) + devlink->ops->trap_fini(devlink, trap, trap_item); + devlink_trap_item_group_unlink(devlink, trap_item); + free_percpu(trap_item->stats); + kfree(trap_item); +} + +static void devlink_trap_disable(struct devlink *devlink, + const struct devlink_trap *trap) +{ + struct devlink_trap_item *trap_item; + + trap_item = devlink_trap_item_lookup(devlink, trap->name); + if (WARN_ON_ONCE(!trap_item)) + return; + + devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP); + trap_item->action = DEVLINK_TRAP_ACTION_DROP; +} + +/** + * devlink_traps_register - Register packet traps with devlink. + * @devlink: devlink. + * @traps: Packet traps. + * @traps_count: Count of provided packet traps. + * @priv: Driver private information. + * + * Return: Non-zero value on failure. + */ +int devlink_traps_register(struct devlink *devlink, + const struct devlink_trap *traps, + size_t traps_count, void *priv) +{ + int i, err; + + if (!devlink->ops->trap_init || !devlink->ops->trap_action_set) + return -EINVAL; + + mutex_lock(&devlink->lock); + for (i = 0; i < traps_count; i++) { + const struct devlink_trap *trap = &traps[i]; + + err = devlink_trap_verify(trap); + if (err) + goto err_trap_verify; + + err = devlink_trap_register(devlink, trap, priv); + if (err) + goto err_trap_register; + } + mutex_unlock(&devlink->lock); + + return 0; + +err_trap_register: +err_trap_verify: + for (i--; i >= 0; i--) + devlink_trap_unregister(devlink, &traps[i]); + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_traps_register); + +/** + * devlink_traps_unregister - Unregister packet traps from devlink. + * @devlink: devlink. + * @traps: Packet traps. + * @traps_count: Count of provided packet traps. + */ +void devlink_traps_unregister(struct devlink *devlink, + const struct devlink_trap *traps, + size_t traps_count) +{ + int i; + + mutex_lock(&devlink->lock); + /* Make sure we do not have any packets in-flight while unregistering + * traps by disabling all of them and waiting for a grace period. + */ + for (i = traps_count - 1; i >= 0; i--) + devlink_trap_disable(devlink, &traps[i]); + synchronize_rcu(); + for (i = traps_count - 1; i >= 0; i--) + devlink_trap_unregister(devlink, &traps[i]); + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_traps_unregister); + +static void +devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats, + size_t skb_len) +{ + struct devlink_stats *stats; + + stats = this_cpu_ptr(trap_stats); + u64_stats_update_begin(&stats->syncp); + stats->rx_bytes += skb_len; + stats->rx_packets++; + u64_stats_update_end(&stats->syncp); +} + +static void +devlink_trap_report_metadata_fill(struct net_dm_hw_metadata *hw_metadata, + const struct devlink_trap_item *trap_item, + struct devlink_port *in_devlink_port) +{ + struct devlink_trap_group_item *group_item = trap_item->group_item; + + hw_metadata->trap_group_name = group_item->group->name; + hw_metadata->trap_name = trap_item->trap->name; + + spin_lock(&in_devlink_port->type_lock); + if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH) + hw_metadata->input_dev = in_devlink_port->type_dev; + spin_unlock(&in_devlink_port->type_lock); +} + +/** + * devlink_trap_report - Report trapped packet to drop monitor. + * @devlink: devlink. + * @skb: Trapped packet. + * @trap_ctx: Trap context. + * @in_devlink_port: Input devlink port. + */ +void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, + void *trap_ctx, struct devlink_port *in_devlink_port) +{ + struct devlink_trap_item *trap_item = trap_ctx; + struct net_dm_hw_metadata hw_metadata = {}; + + devlink_trap_stats_update(trap_item->stats, skb->len); + devlink_trap_stats_update(trap_item->group_item->stats, skb->len); + + devlink_trap_report_metadata_fill(&hw_metadata, trap_item, + in_devlink_port); + net_dm_hw_report(skb, &hw_metadata); +} +EXPORT_SYMBOL_GPL(devlink_trap_report); + +/** + * devlink_trap_ctx_priv - Trap context to driver private information. + * @trap_ctx: Trap context. + * + * Return: Driver private information passed during registration. + */ +void *devlink_trap_ctx_priv(void *trap_ctx) +{ + struct devlink_trap_item *trap_item = trap_ctx; + + return trap_item->priv; +} +EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv); + static void __devlink_compat_running_version(struct devlink *devlink, char *buf, size_t len) { @@ -6941,11 +8275,10 @@ int devlink_compat_switch_id_get(struct net_device *dev, { struct devlink_port *devlink_port; - /* RTNL mutex is held here which ensures that devlink_port - * instance cannot disappear in the middle. No need to take + /* Caller must hold RTNL mutex or reference to dev, which ensures that + * devlink_port instance cannot disappear in the middle. No need to take * any devlink lock as only permanent values are accessed. */ - ASSERT_RTNL(); devlink_port = netdev_to_devlink_port(dev); if (!devlink_port || !devlink_port->attrs.switch_port) return -EOPNOTSUPP; @@ -6955,9 +8288,43 @@ int devlink_compat_switch_id_get(struct net_device *dev, return 0; } +static void __net_exit devlink_pernet_pre_exit(struct net *net) +{ + struct devlink *devlink; + int err; + + /* In case network namespace is getting destroyed, reload + * all devlink instances from this namespace into init_net. + */ + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (net_eq(devlink_net(devlink), net)) { + if (WARN_ON(!devlink_reload_supported(devlink))) + continue; + err = devlink_reload(devlink, &init_net, NULL); + if (err && err != -EOPNOTSUPP) + pr_warn("Failed to reload devlink instance into init_net\n"); + } + } + mutex_unlock(&devlink_mutex); +} + +static struct pernet_operations devlink_pernet_ops __net_initdata = { + .pre_exit = devlink_pernet_pre_exit, +}; + static int __init devlink_init(void) { - return genl_register_family(&devlink_nl_family); + int err; + + err = genl_register_family(&devlink_nl_family); + if (err) + goto out; + err = register_pernet_subsys(&devlink_pernet_ops); + +out: + WARN_ON(err); + return err; } subsys_initcall(devlink_init); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 4ea4347f5062..31700e0c3928 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -26,6 +26,7 @@ #include <linux/bitops.h> #include <linux/slab.h> #include <linux/module.h> +#include <net/drop_monitor.h> #include <net/genetlink.h> #include <net/netevent.h> @@ -43,13 +44,44 @@ * netlink alerts */ static int trace_state = TRACE_OFF; -static DEFINE_MUTEX(trace_state_mutex); +static bool monitor_hw; + +/* net_dm_mutex + * + * An overall lock guarding every operation coming from userspace. + * It also guards the global 'hw_stats_list' list. + */ +static DEFINE_MUTEX(net_dm_mutex); + +struct net_dm_stats { + u64 dropped; + struct u64_stats_sync syncp; +}; + +#define NET_DM_MAX_HW_TRAP_NAME_LEN 40 + +struct net_dm_hw_entry { + char trap_name[NET_DM_MAX_HW_TRAP_NAME_LEN]; + u32 count; +}; + +struct net_dm_hw_entries { + u32 num_entries; + struct net_dm_hw_entry entries[0]; +}; struct per_cpu_dm_data { - spinlock_t lock; - struct sk_buff *skb; + spinlock_t lock; /* Protects 'skb', 'hw_entries' and + * 'send_timer' + */ + union { + struct sk_buff *skb; + struct net_dm_hw_entries *hw_entries; + }; + struct sk_buff_head drop_queue; struct work_struct dm_alert_work; struct timer_list send_timer; + struct net_dm_stats stats; }; struct dm_hw_stat_delta { @@ -63,12 +95,37 @@ struct dm_hw_stat_delta { static struct genl_family net_drop_monitor_family; static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); +static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; static unsigned long dm_hw_check_delta = 2*HZ; static LIST_HEAD(hw_stats_list); +static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY; +static u32 net_dm_trunc_len; +static u32 net_dm_queue_len = 1000; + +struct net_dm_alert_ops { + void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb, + void *location); + void (*napi_poll_probe)(void *ignore, struct napi_struct *napi, + int work, int budget); + void (*work_item_func)(struct work_struct *work); + void (*hw_work_item_func)(struct work_struct *work); + void (*hw_probe)(struct sk_buff *skb, + const struct net_dm_hw_metadata *hw_metadata); +}; + +struct net_dm_skb_cb { + union { + struct net_dm_hw_metadata *hw_metadata; + void *pc; + }; +}; + +#define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0])) + static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) { size_t al; @@ -235,48 +292,842 @@ static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, rcu_read_unlock(); } -static int set_all_monitor_traces(int state) +static struct net_dm_hw_entries * +net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data) { - int rc = 0; - struct dm_hw_stat_delta *new_stat = NULL; - struct dm_hw_stat_delta *temp; + struct net_dm_hw_entries *hw_entries; + unsigned long flags; + + hw_entries = kzalloc(struct_size(hw_entries, entries, dm_hit_limit), + GFP_KERNEL); + if (!hw_entries) { + /* If the memory allocation failed, we try to perform another + * allocation in 1/10 second. Otherwise, the probe function + * will constantly bail out. + */ + mod_timer(&hw_data->send_timer, jiffies + HZ / 10); + } - mutex_lock(&trace_state_mutex); + spin_lock_irqsave(&hw_data->lock, flags); + swap(hw_data->hw_entries, hw_entries); + spin_unlock_irqrestore(&hw_data->lock, flags); - if (state == trace_state) { - rc = -EAGAIN; - goto out_unlock; + return hw_entries; +} + +static int net_dm_hw_entry_put(struct sk_buff *msg, + const struct net_dm_hw_entry *hw_entry) +{ + struct nlattr *attr; + + attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRY); + if (!attr) + return -EMSGSIZE; + + if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, hw_entry->trap_name)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_HW_TRAP_COUNT, hw_entry->count)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int net_dm_hw_entries_put(struct sk_buff *msg, + const struct net_dm_hw_entries *hw_entries) +{ + struct nlattr *attr; + int i; + + attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRIES); + if (!attr) + return -EMSGSIZE; + + for (i = 0; i < hw_entries->num_entries; i++) { + int rc; + + rc = net_dm_hw_entry_put(msg, &hw_entries->entries[i]); + if (rc) + goto nla_put_failure; } - switch (state) { - case TRACE_ON: - if (!try_module_get(THIS_MODULE)) { - rc = -ENODEV; - break; + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int +net_dm_hw_summary_report_fill(struct sk_buff *msg, + const struct net_dm_hw_entries *hw_entries) +{ + struct net_dm_alert_msg anc_hdr = { 0 }; + void *hdr; + int rc; + + hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, + NET_DM_CMD_ALERT); + if (!hdr) + return -EMSGSIZE; + + /* We need to put the ancillary header in order not to break user + * space. + */ + if (nla_put(msg, NLA_UNSPEC, sizeof(anc_hdr), &anc_hdr)) + goto nla_put_failure; + + rc = net_dm_hw_entries_put(msg, hw_entries); + if (rc) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void net_dm_hw_summary_work(struct work_struct *work) +{ + struct net_dm_hw_entries *hw_entries; + struct per_cpu_dm_data *hw_data; + struct sk_buff *msg; + int rc; + + hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work); + + hw_entries = net_dm_hw_reset_per_cpu_data(hw_data); + if (!hw_entries) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + goto out; + + rc = net_dm_hw_summary_report_fill(msg, hw_entries); + if (rc) { + nlmsg_free(msg); + goto out; + } + + genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); + +out: + kfree(hw_entries); +} + +static void +net_dm_hw_summary_probe(struct sk_buff *skb, + const struct net_dm_hw_metadata *hw_metadata) +{ + struct net_dm_hw_entries *hw_entries; + struct net_dm_hw_entry *hw_entry; + struct per_cpu_dm_data *hw_data; + unsigned long flags; + int i; + + hw_data = this_cpu_ptr(&dm_hw_cpu_data); + spin_lock_irqsave(&hw_data->lock, flags); + hw_entries = hw_data->hw_entries; + + if (!hw_entries) + goto out; + + for (i = 0; i < hw_entries->num_entries; i++) { + hw_entry = &hw_entries->entries[i]; + if (!strncmp(hw_entry->trap_name, hw_metadata->trap_name, + NET_DM_MAX_HW_TRAP_NAME_LEN - 1)) { + hw_entry->count++; + goto out; } + } + if (WARN_ON_ONCE(hw_entries->num_entries == dm_hit_limit)) + goto out; - rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL); - rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL); - break; + hw_entry = &hw_entries->entries[hw_entries->num_entries]; + strlcpy(hw_entry->trap_name, hw_metadata->trap_name, + NET_DM_MAX_HW_TRAP_NAME_LEN - 1); + hw_entry->count = 1; + hw_entries->num_entries++; - case TRACE_OFF: - rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL); - rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL); + if (!timer_pending(&hw_data->send_timer)) { + hw_data->send_timer.expires = jiffies + dm_delay * HZ; + add_timer(&hw_data->send_timer); + } - tracepoint_synchronize_unregister(); +out: + spin_unlock_irqrestore(&hw_data->lock, flags); +} - /* - * Clean the device list +static const struct net_dm_alert_ops net_dm_alert_summary_ops = { + .kfree_skb_probe = trace_kfree_skb_hit, + .napi_poll_probe = trace_napi_poll_hit, + .work_item_func = send_dm_alert, + .hw_work_item_func = net_dm_hw_summary_work, + .hw_probe = net_dm_hw_summary_probe, +}; + +static void net_dm_packet_trace_kfree_skb_hit(void *ignore, + struct sk_buff *skb, + void *location) +{ + ktime_t tstamp = ktime_get_real(); + struct per_cpu_dm_data *data; + struct sk_buff *nskb; + unsigned long flags; + + if (!skb_mac_header_was_set(skb)) + return; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return; + + NET_DM_SKB_CB(nskb)->pc = location; + /* Override the timestamp because we care about the time when the + * packet was dropped. + */ + nskb->tstamp = tstamp; + + data = this_cpu_ptr(&dm_cpu_data); + + spin_lock_irqsave(&data->drop_queue.lock, flags); + if (skb_queue_len(&data->drop_queue) < net_dm_queue_len) + __skb_queue_tail(&data->drop_queue, nskb); + else + goto unlock_free; + spin_unlock_irqrestore(&data->drop_queue.lock, flags); + + schedule_work(&data->dm_alert_work); + + return; + +unlock_free: + spin_unlock_irqrestore(&data->drop_queue.lock, flags); + u64_stats_update_begin(&data->stats.syncp); + data->stats.dropped++; + u64_stats_update_end(&data->stats.syncp); + consume_skb(nskb); +} + +static void net_dm_packet_trace_napi_poll_hit(void *ignore, + struct napi_struct *napi, + int work, int budget) +{ +} + +static size_t net_dm_in_port_size(void) +{ + /* NET_DM_ATTR_IN_PORT nest */ + return nla_total_size(0) + + /* NET_DM_ATTR_PORT_NETDEV_IFINDEX */ + nla_total_size(sizeof(u32)) + + /* NET_DM_ATTR_PORT_NETDEV_NAME */ + nla_total_size(IFNAMSIZ + 1); +} + +#define NET_DM_MAX_SYMBOL_LEN 40 + +static size_t net_dm_packet_report_size(size_t payload_len) +{ + size_t size; + + size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize); + + return NLMSG_ALIGN(size) + + /* NET_DM_ATTR_ORIGIN */ + nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_PC */ + nla_total_size(sizeof(u64)) + + /* NET_DM_ATTR_SYMBOL */ + nla_total_size(NET_DM_MAX_SYMBOL_LEN + 1) + + /* NET_DM_ATTR_IN_PORT */ + net_dm_in_port_size() + + /* NET_DM_ATTR_TIMESTAMP */ + nla_total_size(sizeof(u64)) + + /* NET_DM_ATTR_ORIG_LEN */ + nla_total_size(sizeof(u32)) + + /* NET_DM_ATTR_PROTO */ + nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_PAYLOAD */ + nla_total_size(payload_len); +} + +static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex, + const char *name) +{ + struct nlattr *attr; + + attr = nla_nest_start(msg, NET_DM_ATTR_IN_PORT); + if (!attr) + return -EMSGSIZE; + + if (ifindex && + nla_put_u32(msg, NET_DM_ATTR_PORT_NETDEV_IFINDEX, ifindex)) + goto nla_put_failure; + + if (name && nla_put_string(msg, NET_DM_ATTR_PORT_NETDEV_NAME, name)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, + size_t payload_len) +{ + u64 pc = (u64)(uintptr_t) NET_DM_SKB_CB(skb)->pc; + char buf[NET_DM_MAX_SYMBOL_LEN]; + struct nlattr *attr; + void *hdr; + int rc; + + hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, + NET_DM_CMD_PACKET_ALERT); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, pc, NET_DM_ATTR_PAD)) + goto nla_put_failure; + + snprintf(buf, sizeof(buf), "%pS", NET_DM_SKB_CB(skb)->pc); + if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf)) + goto nla_put_failure; + + rc = net_dm_packet_report_in_port_put(msg, skb->skb_iif, NULL); + if (rc) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, + ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) + goto nla_put_failure; + + if (!payload_len) + goto out; + + if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol))) + goto nla_put_failure; + + attr = skb_put(msg, nla_total_size(payload_len)); + attr->nla_type = NET_DM_ATTR_PAYLOAD; + attr->nla_len = nla_attr_size(payload_len); + if (skb_copy_bits(skb, 0, nla_data(attr), payload_len)) + goto nla_put_failure; + +out: + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +#define NET_DM_MAX_PACKET_SIZE (0xffff - NLA_HDRLEN - NLA_ALIGNTO) + +static void net_dm_packet_report(struct sk_buff *skb) +{ + struct sk_buff *msg; + size_t payload_len; + int rc; + + /* Make sure we start copying the packet from the MAC header */ + if (skb->data > skb_mac_header(skb)) + skb_push(skb, skb->data - skb_mac_header(skb)); + else + skb_pull(skb, skb_mac_header(skb) - skb->data); + + /* Ensure packet fits inside a single netlink attribute */ + payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE); + if (net_dm_trunc_len) + payload_len = min_t(size_t, net_dm_trunc_len, payload_len); + + msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL); + if (!msg) + goto out; + + rc = net_dm_packet_report_fill(msg, skb, payload_len); + if (rc) { + nlmsg_free(msg); + goto out; + } + + genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); + +out: + consume_skb(skb); +} + +static void net_dm_packet_work(struct work_struct *work) +{ + struct per_cpu_dm_data *data; + struct sk_buff_head list; + struct sk_buff *skb; + unsigned long flags; + + data = container_of(work, struct per_cpu_dm_data, dm_alert_work); + + __skb_queue_head_init(&list); + + spin_lock_irqsave(&data->drop_queue.lock, flags); + skb_queue_splice_tail_init(&data->drop_queue, &list); + spin_unlock_irqrestore(&data->drop_queue.lock, flags); + + while ((skb = __skb_dequeue(&list))) + net_dm_packet_report(skb); +} + +static size_t +net_dm_hw_packet_report_size(size_t payload_len, + const struct net_dm_hw_metadata *hw_metadata) +{ + size_t size; + + size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize); + + return NLMSG_ALIGN(size) + + /* NET_DM_ATTR_ORIGIN */ + nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_HW_TRAP_GROUP_NAME */ + nla_total_size(strlen(hw_metadata->trap_group_name) + 1) + + /* NET_DM_ATTR_HW_TRAP_NAME */ + nla_total_size(strlen(hw_metadata->trap_name) + 1) + + /* NET_DM_ATTR_IN_PORT */ + net_dm_in_port_size() + + /* NET_DM_ATTR_TIMESTAMP */ + nla_total_size(sizeof(u64)) + + /* NET_DM_ATTR_ORIG_LEN */ + nla_total_size(sizeof(u32)) + + /* NET_DM_ATTR_PROTO */ + nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_PAYLOAD */ + nla_total_size(payload_len); +} + +static int net_dm_hw_packet_report_fill(struct sk_buff *msg, + struct sk_buff *skb, size_t payload_len) +{ + struct net_dm_hw_metadata *hw_metadata; + struct nlattr *attr; + void *hdr; + + hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; + + hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, + NET_DM_CMD_PACKET_ALERT); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_HW)) + goto nla_put_failure; + + if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_GROUP_NAME, + hw_metadata->trap_group_name)) + goto nla_put_failure; + + if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, + hw_metadata->trap_name)) + goto nla_put_failure; + + if (hw_metadata->input_dev) { + struct net_device *dev = hw_metadata->input_dev; + int rc; + + rc = net_dm_packet_report_in_port_put(msg, dev->ifindex, + dev->name); + if (rc) + goto nla_put_failure; + } + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, + ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) + goto nla_put_failure; + + if (!payload_len) + goto out; + + if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol))) + goto nla_put_failure; + + attr = skb_put(msg, nla_total_size(payload_len)); + attr->nla_type = NET_DM_ATTR_PAYLOAD; + attr->nla_len = nla_attr_size(payload_len); + if (skb_copy_bits(skb, 0, nla_data(attr), payload_len)) + goto nla_put_failure; + +out: + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static struct net_dm_hw_metadata * +net_dm_hw_metadata_clone(const struct net_dm_hw_metadata *hw_metadata) +{ + struct net_dm_hw_metadata *n_hw_metadata; + const char *trap_group_name; + const char *trap_name; + + n_hw_metadata = kmalloc(sizeof(*hw_metadata), GFP_ATOMIC); + if (!n_hw_metadata) + return NULL; + + trap_group_name = kstrdup(hw_metadata->trap_group_name, GFP_ATOMIC); + if (!trap_group_name) + goto free_hw_metadata; + n_hw_metadata->trap_group_name = trap_group_name; + + trap_name = kstrdup(hw_metadata->trap_name, GFP_ATOMIC); + if (!trap_name) + goto free_trap_group; + n_hw_metadata->trap_name = trap_name; + + n_hw_metadata->input_dev = hw_metadata->input_dev; + if (n_hw_metadata->input_dev) + dev_hold(n_hw_metadata->input_dev); + + return n_hw_metadata; + +free_trap_group: + kfree(trap_group_name); +free_hw_metadata: + kfree(n_hw_metadata); + return NULL; +} + +static void +net_dm_hw_metadata_free(const struct net_dm_hw_metadata *hw_metadata) +{ + if (hw_metadata->input_dev) + dev_put(hw_metadata->input_dev); + kfree(hw_metadata->trap_name); + kfree(hw_metadata->trap_group_name); + kfree(hw_metadata); +} + +static void net_dm_hw_packet_report(struct sk_buff *skb) +{ + struct net_dm_hw_metadata *hw_metadata; + struct sk_buff *msg; + size_t payload_len; + int rc; + + if (skb->data > skb_mac_header(skb)) + skb_push(skb, skb->data - skb_mac_header(skb)); + else + skb_pull(skb, skb_mac_header(skb) - skb->data); + + payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE); + if (net_dm_trunc_len) + payload_len = min_t(size_t, net_dm_trunc_len, payload_len); + + hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; + msg = nlmsg_new(net_dm_hw_packet_report_size(payload_len, hw_metadata), + GFP_KERNEL); + if (!msg) + goto out; + + rc = net_dm_hw_packet_report_fill(msg, skb, payload_len); + if (rc) { + nlmsg_free(msg); + goto out; + } + + genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); + +out: + net_dm_hw_metadata_free(NET_DM_SKB_CB(skb)->hw_metadata); + consume_skb(skb); +} + +static void net_dm_hw_packet_work(struct work_struct *work) +{ + struct per_cpu_dm_data *hw_data; + struct sk_buff_head list; + struct sk_buff *skb; + unsigned long flags; + + hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work); + + __skb_queue_head_init(&list); + + spin_lock_irqsave(&hw_data->drop_queue.lock, flags); + skb_queue_splice_tail_init(&hw_data->drop_queue, &list); + spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); + + while ((skb = __skb_dequeue(&list))) + net_dm_hw_packet_report(skb); +} + +static void +net_dm_hw_packet_probe(struct sk_buff *skb, + const struct net_dm_hw_metadata *hw_metadata) +{ + struct net_dm_hw_metadata *n_hw_metadata; + ktime_t tstamp = ktime_get_real(); + struct per_cpu_dm_data *hw_data; + struct sk_buff *nskb; + unsigned long flags; + + if (!skb_mac_header_was_set(skb)) + return; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return; + + n_hw_metadata = net_dm_hw_metadata_clone(hw_metadata); + if (!n_hw_metadata) + goto free; + + NET_DM_SKB_CB(nskb)->hw_metadata = n_hw_metadata; + nskb->tstamp = tstamp; + + hw_data = this_cpu_ptr(&dm_hw_cpu_data); + + spin_lock_irqsave(&hw_data->drop_queue.lock, flags); + if (skb_queue_len(&hw_data->drop_queue) < net_dm_queue_len) + __skb_queue_tail(&hw_data->drop_queue, nskb); + else + goto unlock_free; + spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); + + schedule_work(&hw_data->dm_alert_work); + + return; + +unlock_free: + spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); + u64_stats_update_begin(&hw_data->stats.syncp); + hw_data->stats.dropped++; + u64_stats_update_end(&hw_data->stats.syncp); + net_dm_hw_metadata_free(n_hw_metadata); +free: + consume_skb(nskb); +} + +static const struct net_dm_alert_ops net_dm_alert_packet_ops = { + .kfree_skb_probe = net_dm_packet_trace_kfree_skb_hit, + .napi_poll_probe = net_dm_packet_trace_napi_poll_hit, + .work_item_func = net_dm_packet_work, + .hw_work_item_func = net_dm_hw_packet_work, + .hw_probe = net_dm_hw_packet_probe, +}; + +static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = { + [NET_DM_ALERT_MODE_SUMMARY] = &net_dm_alert_summary_ops, + [NET_DM_ALERT_MODE_PACKET] = &net_dm_alert_packet_ops, +}; + +void net_dm_hw_report(struct sk_buff *skb, + const struct net_dm_hw_metadata *hw_metadata) +{ + rcu_read_lock(); + + if (!monitor_hw) + goto out; + + net_dm_alert_ops_arr[net_dm_alert_mode]->hw_probe(skb, hw_metadata); + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(net_dm_hw_report); + +static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack) +{ + const struct net_dm_alert_ops *ops; + int cpu; + + if (monitor_hw) { + NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already enabled"); + return -EAGAIN; + } + + ops = net_dm_alert_ops_arr[net_dm_alert_mode]; + + if (!try_module_get(THIS_MODULE)) { + NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module"); + return -ENODEV; + } + + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); + struct net_dm_hw_entries *hw_entries; + + INIT_WORK(&hw_data->dm_alert_work, ops->hw_work_item_func); + timer_setup(&hw_data->send_timer, sched_send_work, 0); + hw_entries = net_dm_hw_reset_per_cpu_data(hw_data); + kfree(hw_entries); + } + + monitor_hw = true; + + return 0; +} + +static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack) +{ + int cpu; + + if (!monitor_hw) { + NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already disabled"); + return; + } + + monitor_hw = false; + + /* After this call returns we are guaranteed that no CPU is processing + * any hardware drops. + */ + synchronize_rcu(); + + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); + struct sk_buff *skb; + + del_timer_sync(&hw_data->send_timer); + cancel_work_sync(&hw_data->dm_alert_work); + while ((skb = __skb_dequeue(&hw_data->drop_queue))) { + struct net_dm_hw_metadata *hw_metadata; + + hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; + net_dm_hw_metadata_free(hw_metadata); + consume_skb(skb); + } + } + + module_put(THIS_MODULE); +} + +static int net_dm_trace_on_set(struct netlink_ext_ack *extack) +{ + const struct net_dm_alert_ops *ops; + int cpu, rc; + + ops = net_dm_alert_ops_arr[net_dm_alert_mode]; + + if (!try_module_get(THIS_MODULE)) { + NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module"); + return -ENODEV; + } + + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); + struct sk_buff *skb; + + INIT_WORK(&data->dm_alert_work, ops->work_item_func); + timer_setup(&data->send_timer, sched_send_work, 0); + /* Allocate a new per-CPU skb for the summary alert message and + * free the old one which might contain stale data from + * previous tracing. */ - list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { - if (new_stat->dev == NULL) { - list_del_rcu(&new_stat->list); - kfree_rcu(new_stat, rcu); - } + skb = reset_per_cpu_data(data); + consume_skb(skb); + } + + rc = register_trace_kfree_skb(ops->kfree_skb_probe, NULL); + if (rc) { + NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to kfree_skb() tracepoint"); + goto err_module_put; + } + + rc = register_trace_napi_poll(ops->napi_poll_probe, NULL); + if (rc) { + NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to napi_poll() tracepoint"); + goto err_unregister_trace; + } + + return 0; + +err_unregister_trace: + unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL); +err_module_put: + module_put(THIS_MODULE); + return rc; +} + +static void net_dm_trace_off_set(void) +{ + struct dm_hw_stat_delta *new_stat, *temp; + const struct net_dm_alert_ops *ops; + int cpu; + + ops = net_dm_alert_ops_arr[net_dm_alert_mode]; + + unregister_trace_napi_poll(ops->napi_poll_probe, NULL); + unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL); + + tracepoint_synchronize_unregister(); + + /* Make sure we do not send notifications to user space after request + * to stop tracing returns. + */ + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); + struct sk_buff *skb; + + del_timer_sync(&data->send_timer); + cancel_work_sync(&data->dm_alert_work); + while ((skb = __skb_dequeue(&data->drop_queue))) + consume_skb(skb); + } + + list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { + if (new_stat->dev == NULL) { + list_del_rcu(&new_stat->list); + kfree_rcu(new_stat, rcu); } + } - module_put(THIS_MODULE); + module_put(THIS_MODULE); +} + +static int set_all_monitor_traces(int state, struct netlink_ext_ack *extack) +{ + int rc = 0; + if (state == trace_state) { + NL_SET_ERR_MSG_MOD(extack, "Trace state already set to requested state"); + return -EAGAIN; + } + + switch (state) { + case TRACE_ON: + rc = net_dm_trace_on_set(extack); + break; + case TRACE_OFF: + net_dm_trace_off_set(); break; default: rc = 1; @@ -288,30 +1139,331 @@ static int set_all_monitor_traces(int state) else rc = -EINPROGRESS; -out_unlock: - mutex_unlock(&trace_state_mutex); - return rc; } +static bool net_dm_is_monitoring(void) +{ + return trace_state == TRACE_ON || monitor_hw; +} + +static int net_dm_alert_mode_get_from_info(struct genl_info *info, + enum net_dm_alert_mode *p_alert_mode) +{ + u8 val; + + val = nla_get_u8(info->attrs[NET_DM_ATTR_ALERT_MODE]); + + switch (val) { + case NET_DM_ALERT_MODE_SUMMARY: /* fall-through */ + case NET_DM_ALERT_MODE_PACKET: + *p_alert_mode = val; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int net_dm_alert_mode_set(struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + enum net_dm_alert_mode alert_mode; + int rc; + + if (!info->attrs[NET_DM_ATTR_ALERT_MODE]) + return 0; + + rc = net_dm_alert_mode_get_from_info(info, &alert_mode); + if (rc) { + NL_SET_ERR_MSG_MOD(extack, "Invalid alert mode"); + return -EINVAL; + } + + net_dm_alert_mode = alert_mode; + + return 0; +} + +static void net_dm_trunc_len_set(struct genl_info *info) +{ + if (!info->attrs[NET_DM_ATTR_TRUNC_LEN]) + return; + + net_dm_trunc_len = nla_get_u32(info->attrs[NET_DM_ATTR_TRUNC_LEN]); +} + +static void net_dm_queue_len_set(struct genl_info *info) +{ + if (!info->attrs[NET_DM_ATTR_QUEUE_LEN]) + return; + + net_dm_queue_len = nla_get_u32(info->attrs[NET_DM_ATTR_QUEUE_LEN]); +} static int net_dm_cmd_config(struct sk_buff *skb, struct genl_info *info) { - return -ENOTSUPP; + struct netlink_ext_ack *extack = info->extack; + int rc; + + if (net_dm_is_monitoring()) { + NL_SET_ERR_MSG_MOD(extack, "Cannot configure drop monitor during monitoring"); + return -EBUSY; + } + + rc = net_dm_alert_mode_set(info); + if (rc) + return rc; + + net_dm_trunc_len_set(info); + + net_dm_queue_len_set(info); + + return 0; +} + +static int net_dm_monitor_start(bool set_sw, bool set_hw, + struct netlink_ext_ack *extack) +{ + bool sw_set = false; + int rc; + + if (set_sw) { + rc = set_all_monitor_traces(TRACE_ON, extack); + if (rc) + return rc; + sw_set = true; + } + + if (set_hw) { + rc = net_dm_hw_monitor_start(extack); + if (rc) + goto err_monitor_hw; + } + + return 0; + +err_monitor_hw: + if (sw_set) + set_all_monitor_traces(TRACE_OFF, extack); + return rc; +} + +static void net_dm_monitor_stop(bool set_sw, bool set_hw, + struct netlink_ext_ack *extack) +{ + if (set_hw) + net_dm_hw_monitor_stop(extack); + if (set_sw) + set_all_monitor_traces(TRACE_OFF, extack); } static int net_dm_cmd_trace(struct sk_buff *skb, struct genl_info *info) { + bool set_sw = !!info->attrs[NET_DM_ATTR_SW_DROPS]; + bool set_hw = !!info->attrs[NET_DM_ATTR_HW_DROPS]; + struct netlink_ext_ack *extack = info->extack; + + /* To maintain backward compatibility, we start / stop monitoring of + * software drops if no flag is specified. + */ + if (!set_sw && !set_hw) + set_sw = true; + switch (info->genlhdr->cmd) { case NET_DM_CMD_START: - return set_all_monitor_traces(TRACE_ON); + return net_dm_monitor_start(set_sw, set_hw, extack); case NET_DM_CMD_STOP: - return set_all_monitor_traces(TRACE_OFF); + net_dm_monitor_stop(set_sw, set_hw, extack); + return 0; } - return -ENOTSUPP; + return -EOPNOTSUPP; +} + +static int net_dm_config_fill(struct sk_buff *msg, struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &net_drop_monitor_family, 0, NET_DM_CMD_CONFIG_NEW); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u8(msg, NET_DM_ATTR_ALERT_MODE, net_dm_alert_mode)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_TRUNC_LEN, net_dm_trunc_len)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_QUEUE_LEN, net_dm_queue_len)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int net_dm_cmd_config_get(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + int rc; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + rc = net_dm_config_fill(msg, info); + if (rc) + goto free_msg; + + return genlmsg_reply(msg, info); + +free_msg: + nlmsg_free(msg); + return rc; +} + +static void net_dm_stats_read(struct net_dm_stats *stats) +{ + int cpu; + + memset(stats, 0, sizeof(*stats)); + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); + struct net_dm_stats *cpu_stats = &data->stats; + unsigned int start; + u64 dropped; + + do { + start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + dropped = cpu_stats->dropped; + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + + stats->dropped += dropped; + } +} + +static int net_dm_stats_put(struct sk_buff *msg) +{ + struct net_dm_stats stats; + struct nlattr *attr; + + net_dm_stats_read(&stats); + + attr = nla_nest_start(msg, NET_DM_ATTR_STATS); + if (!attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, + stats.dropped, NET_DM_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static void net_dm_hw_stats_read(struct net_dm_stats *stats) +{ + int cpu; + + memset(stats, 0, sizeof(*stats)); + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); + struct net_dm_stats *cpu_stats = &hw_data->stats; + unsigned int start; + u64 dropped; + + do { + start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + dropped = cpu_stats->dropped; + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + + stats->dropped += dropped; + } +} + +static int net_dm_hw_stats_put(struct sk_buff *msg) +{ + struct net_dm_stats stats; + struct nlattr *attr; + + net_dm_hw_stats_read(&stats); + + attr = nla_nest_start(msg, NET_DM_ATTR_HW_STATS); + if (!attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, + stats.dropped, NET_DM_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int net_dm_stats_fill(struct sk_buff *msg, struct genl_info *info) +{ + void *hdr; + int rc; + + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &net_drop_monitor_family, 0, NET_DM_CMD_STATS_NEW); + if (!hdr) + return -EMSGSIZE; + + rc = net_dm_stats_put(msg); + if (rc) + goto nla_put_failure; + + rc = net_dm_hw_stats_put(msg); + if (rc) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int net_dm_cmd_stats_get(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + int rc; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + rc = net_dm_stats_fill(msg, info); + if (rc) + goto free_msg; + + return genlmsg_reply(msg, info); + +free_msg: + nlmsg_free(msg); + return rc; } static int dropmon_net_event(struct notifier_block *ev_block, @@ -330,12 +1482,12 @@ static int dropmon_net_event(struct notifier_block *ev_block, new_stat->dev = dev; new_stat->last_rx = jiffies; - mutex_lock(&trace_state_mutex); + mutex_lock(&net_dm_mutex); list_add_rcu(&new_stat->list, &hw_stats_list); - mutex_unlock(&trace_state_mutex); + mutex_unlock(&net_dm_mutex); break; case NETDEV_UNREGISTER: - mutex_lock(&trace_state_mutex); + mutex_lock(&net_dm_mutex); list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { if (new_stat->dev == dev) { new_stat->dev = NULL; @@ -346,18 +1498,28 @@ static int dropmon_net_event(struct notifier_block *ev_block, } } } - mutex_unlock(&trace_state_mutex); + mutex_unlock(&net_dm_mutex); break; } out: return NOTIFY_DONE; } +static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = { + [NET_DM_ATTR_UNSPEC] = { .strict_start_type = NET_DM_ATTR_UNSPEC + 1 }, + [NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 }, + [NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 }, + [NET_DM_ATTR_QUEUE_LEN] = { .type = NLA_U32 }, + [NET_DM_ATTR_SW_DROPS] = {. type = NLA_FLAG }, + [NET_DM_ATTR_HW_DROPS] = {. type = NLA_FLAG }, +}; + static const struct genl_ops dropmon_ops[] = { { .cmd = NET_DM_CMD_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_config, + .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_START, @@ -369,12 +1531,38 @@ static const struct genl_ops dropmon_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, }, + { + .cmd = NET_DM_CMD_CONFIG_GET, + .doit = net_dm_cmd_config_get, + }, + { + .cmd = NET_DM_CMD_STATS_GET, + .doit = net_dm_cmd_stats_get, + }, }; +static int net_dm_nl_pre_doit(const struct genl_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + mutex_lock(&net_dm_mutex); + + return 0; +} + +static void net_dm_nl_post_doit(const struct genl_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + mutex_unlock(&net_dm_mutex); +} + static struct genl_family net_drop_monitor_family __ro_after_init = { .hdrsize = 0, .name = "NET_DM", .version = 2, + .maxattr = NET_DM_ATTR_MAX, + .policy = net_dm_nl_policy, + .pre_doit = net_dm_nl_pre_doit, + .post_doit = net_dm_nl_post_doit, .module = THIS_MODULE, .ops = dropmon_ops, .n_ops = ARRAY_SIZE(dropmon_ops), @@ -386,9 +1574,57 @@ static struct notifier_block dropmon_net_notifier = { .notifier_call = dropmon_net_event }; -static int __init init_net_drop_monitor(void) +static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data) +{ + spin_lock_init(&data->lock); + skb_queue_head_init(&data->drop_queue); + u64_stats_init(&data->stats.syncp); +} + +static void __net_dm_cpu_data_fini(struct per_cpu_dm_data *data) +{ + WARN_ON(!skb_queue_empty(&data->drop_queue)); +} + +static void net_dm_cpu_data_init(int cpu) +{ + struct per_cpu_dm_data *data; + + data = &per_cpu(dm_cpu_data, cpu); + __net_dm_cpu_data_init(data); +} + +static void net_dm_cpu_data_fini(int cpu) { struct per_cpu_dm_data *data; + + data = &per_cpu(dm_cpu_data, cpu); + /* At this point, we should have exclusive access + * to this struct and can free the skb inside it. + */ + consume_skb(data->skb); + __net_dm_cpu_data_fini(data); +} + +static void net_dm_hw_cpu_data_init(int cpu) +{ + struct per_cpu_dm_data *hw_data; + + hw_data = &per_cpu(dm_hw_cpu_data, cpu); + __net_dm_cpu_data_init(hw_data); +} + +static void net_dm_hw_cpu_data_fini(int cpu) +{ + struct per_cpu_dm_data *hw_data; + + hw_data = &per_cpu(dm_hw_cpu_data, cpu); + kfree(hw_data->hw_entries); + __net_dm_cpu_data_fini(hw_data); +} + +static int __init init_net_drop_monitor(void) +{ int cpu, rc; pr_info("Initializing network drop monitor service\n"); @@ -414,14 +1650,10 @@ static int __init init_net_drop_monitor(void) rc = 0; for_each_possible_cpu(cpu) { - data = &per_cpu(dm_cpu_data, cpu); - INIT_WORK(&data->dm_alert_work, send_dm_alert); - timer_setup(&data->send_timer, sched_send_work, 0); - spin_lock_init(&data->lock); - reset_per_cpu_data(data); + net_dm_cpu_data_init(cpu); + net_dm_hw_cpu_data_init(cpu); } - goto out; out_unreg: @@ -432,7 +1664,6 @@ out: static void exit_net_drop_monitor(void) { - struct per_cpu_dm_data *data; int cpu; BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); @@ -440,19 +1671,11 @@ static void exit_net_drop_monitor(void) /* * Because of the module_get/put we do in the trace state change path * we are guarnateed not to have any current users when we get here - * all we need to do is make sure that we don't have any running timers - * or pending schedule calls */ for_each_possible_cpu(cpu) { - data = &per_cpu(dm_cpu_data, cpu); - del_timer_sync(&data->send_timer); - cancel_work_sync(&data->dm_alert_work); - /* - * At this point, we should have exclusive access - * to this struct and can free the skb inside it - */ - kfree_skb(data->skb); + net_dm_hw_cpu_data_fini(cpu); + net_dm_cpu_data_fini(cpu); } BUG_ON(genl_unregister_family(&net_drop_monitor_family)); diff --git a/net/core/dst.c b/net/core/dst.c index 1325316d9eab..193af526e908 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -172,7 +172,7 @@ void dst_release(struct dst_entry *dst) int newrefcnt; newrefcnt = atomic_dec_return(&dst->__refcnt); - if (unlikely(newrefcnt < 0)) + if (WARN_ONCE(newrefcnt < 0, "dst_release underflow")) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); if (!newrefcnt) @@ -187,7 +187,7 @@ void dst_release_immediate(struct dst_entry *dst) int newrefcnt; newrefcnt = atomic_dec_return(&dst->__refcnt); - if (unlikely(newrefcnt < 0)) + if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow")) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); if (!newrefcnt) diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 13a40b831d6d..fc96259807b6 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -5,17 +5,22 @@ #include <linux/module.h> #include <linux/init.h> #include <net/net_namespace.h> +#include <net/netns/generic.h> #include <net/fib_notifier.h> -static ATOMIC_NOTIFIER_HEAD(fib_chain); +static unsigned int fib_notifier_net_id; -int call_fib_notifier(struct notifier_block *nb, struct net *net, +struct fib_notifier_net { + struct list_head fib_notifier_ops; + struct atomic_notifier_head fib_chain; +}; + +int call_fib_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { int err; - info->net = net; err = nb->notifier_call(nb, event_type, info); return notifier_to_errno(err); } @@ -24,115 +29,113 @@ EXPORT_SYMBOL(call_fib_notifier); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); int err; - info->net = net; - err = atomic_notifier_call_chain(&fib_chain, event_type, info); + err = atomic_notifier_call_chain(&fn_net->fib_chain, event_type, info); return notifier_to_errno(err); } EXPORT_SYMBOL(call_fib_notifiers); -static unsigned int fib_seq_sum(void) +static unsigned int fib_seq_sum(struct net *net) { + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *ops; unsigned int fib_seq = 0; - struct net *net; rtnl_lock(); - down_read(&net_rwsem); - for_each_net(net) { - rcu_read_lock(); - list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) { - if (!try_module_get(ops->owner)) - continue; - fib_seq += ops->fib_seq_read(net); - module_put(ops->owner); - } - rcu_read_unlock(); + rcu_read_lock(); + list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { + if (!try_module_get(ops->owner)) + continue; + fib_seq += ops->fib_seq_read(net); + module_put(ops->owner); } - up_read(&net_rwsem); + rcu_read_unlock(); rtnl_unlock(); return fib_seq; } -static int fib_net_dump(struct net *net, struct notifier_block *nb) +static int fib_net_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *ops; + int err = 0; - list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) { - int err; - + rcu_read_lock(); + list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { if (!try_module_get(ops->owner)) continue; - err = ops->fib_dump(net, nb); + err = ops->fib_dump(net, nb, extack); module_put(ops->owner); if (err) - return err; + goto unlock; } - return 0; +unlock: + rcu_read_unlock(); + + return err; } -static bool fib_dump_is_consistent(struct notifier_block *nb, +static bool fib_dump_is_consistent(struct net *net, struct notifier_block *nb, void (*cb)(struct notifier_block *nb), unsigned int fib_seq) { - atomic_notifier_chain_register(&fib_chain, nb); - if (fib_seq == fib_seq_sum()) + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + atomic_notifier_chain_register(&fn_net->fib_chain, nb); + if (fib_seq == fib_seq_sum(net)) return true; - atomic_notifier_chain_unregister(&fib_chain, nb); + atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); if (cb) cb(nb); return false; } #define FIB_DUMP_MAX_RETRIES 5 -int register_fib_notifier(struct notifier_block *nb, - void (*cb)(struct notifier_block *nb)) +int register_fib_notifier(struct net *net, struct notifier_block *nb, + void (*cb)(struct notifier_block *nb), + struct netlink_ext_ack *extack) { int retries = 0; int err; do { - unsigned int fib_seq = fib_seq_sum(); - struct net *net; - - rcu_read_lock(); - for_each_net_rcu(net) { - err = fib_net_dump(net, nb); - if (err) - goto err_fib_net_dump; - } - rcu_read_unlock(); - - if (fib_dump_is_consistent(nb, cb, fib_seq)) + unsigned int fib_seq = fib_seq_sum(net); + + err = fib_net_dump(net, nb, extack); + if (err) + return err; + + if (fib_dump_is_consistent(net, nb, cb, fib_seq)) return 0; } while (++retries < FIB_DUMP_MAX_RETRIES); return -EBUSY; - -err_fib_net_dump: - rcu_read_unlock(); - return err; } EXPORT_SYMBOL(register_fib_notifier); -int unregister_fib_notifier(struct notifier_block *nb) +int unregister_fib_notifier(struct net *net, struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&fib_chain, nb); + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + return atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); } EXPORT_SYMBOL(unregister_fib_notifier); static int __fib_notifier_ops_register(struct fib_notifier_ops *ops, struct net *net) { + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *o; - list_for_each_entry(o, &net->fib_notifier_ops, list) + list_for_each_entry(o, &fn_net->fib_notifier_ops, list) if (ops->family == o->family) return -EEXIST; - list_add_tail_rcu(&ops->list, &net->fib_notifier_ops); + list_add_tail_rcu(&ops->list, &fn_net->fib_notifier_ops); return 0; } @@ -167,18 +170,25 @@ EXPORT_SYMBOL(fib_notifier_ops_unregister); static int __net_init fib_notifier_net_init(struct net *net) { - INIT_LIST_HEAD(&net->fib_notifier_ops); + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + INIT_LIST_HEAD(&fn_net->fib_notifier_ops); + ATOMIC_INIT_NOTIFIER_HEAD(&fn_net->fib_chain); return 0; } static void __net_exit fib_notifier_net_exit(struct net *net) { - WARN_ON_ONCE(!list_empty(&net->fib_notifier_ops)); + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + WARN_ON_ONCE(!list_empty(&fn_net->fib_notifier_ops)); } static struct pernet_operations fib_notifier_net_ops = { .init = fib_notifier_net_init, .exit = fib_notifier_net_exit, + .id = &fib_notifier_net_id, + .size = sizeof(struct fib_notifier_net), }; static int __init fib_notifier_init(void) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index dd220ce7ca7a..3e7e15278c46 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -321,16 +321,18 @@ out: } EXPORT_SYMBOL_GPL(fib_rules_lookup); -static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net, +static int call_fib_rule_notifier(struct notifier_block *nb, enum fib_event_type event_type, - struct fib_rule *rule, int family) + struct fib_rule *rule, int family, + struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = family, + .info.extack = extack, .rule = rule, }; - return call_fib_notifier(nb, net, event_type, &info.info); + return call_fib_notifier(nb, event_type, &info.info); } static int call_fib_rule_notifiers(struct net *net, @@ -350,20 +352,25 @@ static int call_fib_rule_notifiers(struct net *net, } /* Called with rcu_read_lock() */ -int fib_rules_dump(struct net *net, struct notifier_block *nb, int family) +int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, + struct netlink_ext_ack *extack) { struct fib_rules_ops *ops; struct fib_rule *rule; + int err = 0; ops = lookup_rules_ops(net, family); if (!ops) return -EAFNOSUPPORT; - list_for_each_entry_rcu(rule, &ops->rules_list, list) - call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule, - family); + list_for_each_entry_rcu(rule, &ops->rules_list, list) { + err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, + rule, family, extack); + if (err) + break; + } rules_ops_put(ops); - return 0; + return err; } EXPORT_SYMBOL_GPL(fib_rules_dump); diff --git a/net/core/filter.c b/net/core/filter.c index 7878f918b8c0..c180871e606d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -274,7 +274,7 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, switch (skb_field) { case SKF_AD_MARK: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4); *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, offsetof(struct sk_buff, mark)); @@ -289,14 +289,14 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, break; case SKF_AD_QUEUE: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); + BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2); *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, queue_mapping)); break; case SKF_AD_VLAN_TAG: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); + BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2); /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, @@ -322,7 +322,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp, switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); + BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2); /* A = *(u16 *) (CTX + offsetof(protocol)) */ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, @@ -338,8 +338,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_IFINDEX: case SKF_AD_OFF + SKF_AD_HATYPE: - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); + BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4); + BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), BPF_REG_TMP, BPF_REG_CTX, @@ -361,7 +361,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_RXHASH: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); + BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4); *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, hash)); @@ -385,7 +385,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_VLAN_TPID: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); + BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2); /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, @@ -1573,7 +1573,7 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) return -EPERM; prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); - if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL) + if (PTR_ERR(prog) == -EINVAL) prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); if (IS_ERR(prog)) return PTR_ERR(prog); @@ -2055,6 +2055,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) } skb->dev = dev; + skb->tstamp = 0; dev_xmit_recursion_inc(); ret = dev_queue_xmit(skb); @@ -2230,10 +2231,10 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, /* First find the starting scatterlist element */ i = msg->sg.start; do { + offset += len; len = sk_msg_elem(msg, i)->length; if (start < offset + len) break; - offset += len; sk_msg_iter_var_next(i); } while (i != msg->sg.end); @@ -2245,7 +2246,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, * account for the headroom. */ bytes_sg_total = start - offset + bytes; - if (!msg->sg.copy[i] && bytes_sg_total <= len) + if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist @@ -2299,7 +2300,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, WARN_ON_ONCE(last_sge == first_sge); shift = last_sge > first_sge ? last_sge - first_sge - 1 : - MAX_SKB_FRAGS - first_sge + last_sge - 1; + NR_MSG_FRAG_IDS - first_sge + last_sge - 1; if (!shift) goto out; @@ -2308,8 +2309,8 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, do { u32 move_from; - if (i + shift >= MAX_MSG_FRAGS) - move_from = i + shift - MAX_MSG_FRAGS; + if (i + shift >= NR_MSG_FRAG_IDS) + move_from = i + shift - NR_MSG_FRAG_IDS; else move_from = i + shift; if (move_from == msg->sg.end) @@ -2323,7 +2324,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, } while (1); msg->sg.end = msg->sg.end - shift > msg->sg.end ? - msg->sg.end - shift + MAX_MSG_FRAGS : + msg->sg.end - shift + NR_MSG_FRAG_IDS : msg->sg.end - shift; out: msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; @@ -2345,7 +2346,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge; - u32 new, i = 0, l, space, copy = 0, offset = 0; + u32 new, i = 0, l = 0, space, copy = 0, offset = 0; u8 *raw, *to, *from; struct page *page; @@ -2355,11 +2356,11 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, /* First find the starting scatterlist element */ i = msg->sg.start; do { + offset += l; l = sk_msg_elem(msg, i)->length; if (start < offset + l) break; - offset += l; sk_msg_iter_var_next(i); } while (i != msg->sg.end); @@ -2414,6 +2415,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, sk_msg_iter_var_next(i); sg_unmark_end(psge); + sg_unmark_end(&rsge); sk_msg_iter_next(msg, end); } @@ -2450,7 +2452,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; - msg->sg.copy[new] = false; + __clear_bit(new, &msg->sg.copy); sg_set_page(&msg->sg.data[new], page, len + copy, 0); if (rsge.length) { get_page(sg_page(&rsge)); @@ -2505,7 +2507,7 @@ static void sk_msg_shift_right(struct sk_msg *msg, int i) BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { - u32 i = 0, l, space, offset = 0; + u32 i = 0, l = 0, space, offset = 0; u64 last = start + len; int pop; @@ -2515,11 +2517,11 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, /* First find the starting scatterlist element */ i = msg->sg.start; do { + offset += l; l = sk_msg_elem(msg, i)->length; if (start < offset + l) break; - offset += l; sk_msg_iter_var_next(i); } while (i != msg->sg.end); @@ -3457,123 +3459,38 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .arg2_type = ARG_ANYTHING, }; -static int __bpf_tx_xdp(struct net_device *dev, - struct bpf_map *map, - struct xdp_buff *xdp, - u32 index) -{ - struct xdp_frame *xdpf; - int err, sent; - - if (!dev->netdev_ops->ndo_xdp_xmit) { - return -EOPNOTSUPP; - } - - err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); - if (unlikely(err)) - return err; - - xdpf = convert_to_xdp_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - - sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH); - if (sent <= 0) - return sent; - return 0; -} - -static noinline int -xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri) -{ - struct net_device *fwd; - u32 index = ri->tgt_index; - int err; - - fwd = dev_get_by_index_rcu(dev_net(dev), index); - ri->tgt_index = 0; - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } - - err = __bpf_tx_xdp(fwd, NULL, xdp, 0); - if (unlikely(err)) - goto err; - - _trace_xdp_redirect(dev, xdp_prog, index); - return 0; -err: - _trace_xdp_redirect_err(dev, xdp_prog, index, err); - return err; -} - static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, - struct bpf_map *map, - struct xdp_buff *xdp, - u32 index) + struct bpf_map *map, struct xdp_buff *xdp) { - int err; - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: { - struct bpf_dtab_netdev *dst = fwd; - - err = dev_map_enqueue(dst, xdp, dev_rx); - if (unlikely(err)) - return err; - break; - } - case BPF_MAP_TYPE_CPUMAP: { - struct bpf_cpu_map_entry *rcpu = fwd; - - err = cpu_map_enqueue(rcpu, xdp, dev_rx); - if (unlikely(err)) - return err; - break; - } - case BPF_MAP_TYPE_XSKMAP: { - struct xdp_sock *xs = fwd; - - err = __xsk_map_redirect(map, xdp, xs); - return err; - } + case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: + return dev_map_enqueue(fwd, xdp, dev_rx); + case BPF_MAP_TYPE_CPUMAP: + return cpu_map_enqueue(fwd, xdp, dev_rx); + case BPF_MAP_TYPE_XSKMAP: + return __xsk_map_redirect(fwd, xdp); default: - break; + return -EBADRQC; } return 0; } -void xdp_do_flush_map(void) +void xdp_do_flush(void) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = ri->map_to_flush; - - ri->map_to_flush = NULL; - if (map) { - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: - __dev_map_flush(map); - break; - case BPF_MAP_TYPE_CPUMAP: - __cpu_map_flush(map); - break; - case BPF_MAP_TYPE_XSKMAP: - __xsk_map_flush(map); - break; - default: - break; - } - } + __dev_flush(); + __cpu_map_flush(); + __xsk_map_flush(); } -EXPORT_SYMBOL_GPL(xdp_do_flush_map); +EXPORT_SYMBOL_GPL(xdp_do_flush); static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) { switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: return __dev_map_lookup_elem(map, index); + case BPF_MAP_TYPE_DEVMAP_HASH: + return __dev_map_hash_lookup_elem(map, index); case BPF_MAP_TYPE_CPUMAP: return __cpu_map_lookup_elem(map, index); case BPF_MAP_TYPE_XSKMAP: @@ -3600,10 +3517,11 @@ void bpf_clear_redirect_map(struct bpf_map *map) } } -static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, struct bpf_map *map, - struct bpf_redirect_info *ri) +int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) { + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_map *map = READ_ONCE(ri->map); u32 index = ri->tgt_index; void *fwd = ri->tgt_value; int err; @@ -3612,32 +3530,27 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - if (ri->map_to_flush && unlikely(ri->map_to_flush != map)) - xdp_do_flush_map(); + if (unlikely(!map)) { + fwd = dev_get_by_index_rcu(dev_net(dev), index); + if (unlikely(!fwd)) { + err = -EINVAL; + goto err; + } + + err = dev_xdp_enqueue(fwd, xdp, dev); + } else { + err = __bpf_tx_xdp_map(dev, fwd, map, xdp); + } - err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index); if (unlikely(err)) goto err; - ri->map_to_flush = map; _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); return 0; err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); return err; } - -int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) -{ - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = READ_ONCE(ri->map); - - if (likely(map)) - return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri); - - return xdp_do_redirect_slow(dev, xdp, xdp_prog, ri); -} EXPORT_SYMBOL_GPL(xdp_do_redirect); static int xdp_do_generic_redirect_map(struct net_device *dev, @@ -3655,7 +3568,8 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + if (map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { struct bpf_dtab_netdev *dst = fwd; err = dev_map_generic_redirect(dst, skb, xdp_prog); @@ -3793,7 +3707,7 @@ BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; - if (unlikely(skb_size > skb->len)) + if (unlikely(!skb || skb_size > skb->len)) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, @@ -3811,6 +3725,19 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +static int bpf_skb_output_btf_ids[5]; +const struct bpf_func_proto bpf_skb_output_proto = { + .func = bpf_skb_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_skb_output_btf_ids, +}; + static unsigned short bpf_tunnel_key_af(u64 flags) { return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; @@ -4084,7 +4011,7 @@ BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return cgrp->kn->id.id; + return cgroup_id(cgrp); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4109,7 +4036,7 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, if (!ancestor) return 0; - return ancestor->kn->id.id; + return cgroup_id(ancestor); } static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { @@ -4247,12 +4174,14 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, case SO_RCVBUF: val = min_t(u32, val, sysctl_rmem_max); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); + WRITE_ONCE(sk->sk_rcvbuf, + max_t(int, val * 2, SOCK_MIN_RCVBUF)); break; case SO_SNDBUF: val = min_t(u32, val, sysctl_wmem_max); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); + WRITE_ONCE(sk->sk_sndbuf, + max_t(int, val * 2, SOCK_MIN_SNDBUF)); break; case SO_MAX_PACING_RATE: /* 32bit version */ if (val != ~0U) @@ -4269,7 +4198,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, case SO_RCVLOWAT: if (val < 0) val = INT_MAX; - sk->sk_rcvlowat = val ? : 1; + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; case SO_MARK: if (sk->sk_mark != val) { @@ -5297,8 +5226,7 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, if (sk) { sk = sk_to_full_sk(sk); if (!sk_fullsock(sk)) { - if (!sock_flag(sk, SOCK_RCU_FREE)) - sock_gen_put(sk); + sock_gen_put(sk); return NULL; } } @@ -5335,8 +5263,7 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, if (sk) { sk = sk_to_full_sk(sk); if (!sk_fullsock(sk)) { - if (!sock_flag(sk, SOCK_RCU_FREE)) - sock_gen_put(sk); + sock_gen_put(sk); return NULL; } } @@ -5403,7 +5330,8 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { BPF_CALL_1(bpf_sk_release, struct sock *, sk) { - if (!sock_flag(sk, SOCK_RCU_FREE)) + /* Only full sockets have sk->sk_flags. */ + if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE)) sock_gen_put(sk); return 0; } @@ -5569,8 +5497,8 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, #define BPF_TCP_SOCK_GET_COMMON(FIELD) \ do { \ - BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD) > \ - FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \ + BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) > \ + sizeof_field(struct bpf_tcp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\ si->dst_reg, si->src_reg, \ offsetof(struct tcp_sock, FIELD)); \ @@ -5578,9 +5506,9 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, #define BPF_INET_SOCK_GET_COMMON(FIELD) \ do { \ - BUILD_BUG_ON(FIELD_SIZEOF(struct inet_connection_sock, \ + BUILD_BUG_ON(sizeof_field(struct inet_connection_sock, \ FIELD) > \ - FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \ + sizeof_field(struct bpf_tcp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct inet_connection_sock, \ FIELD), \ @@ -5595,7 +5523,7 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, switch (si->off) { case offsetof(struct bpf_tcp_sock, rtt_min): - BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != + BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != sizeof(struct minmax)); BUILD_BUG_ON(sizeof(struct minmax) < sizeof(struct minmax_sample)); @@ -5760,8 +5688,8 @@ u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, #define BPF_XDP_SOCK_GET(FIELD) \ do { \ - BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_sock, FIELD) > \ - FIELD_SIZEOF(struct bpf_xdp_sock, FIELD)); \ + BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) > \ + sizeof_field(struct bpf_xdp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\ si->dst_reg, si->src_reg, \ offsetof(struct xdp_sock, FIELD)); \ @@ -5850,6 +5778,75 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, + struct tcphdr *, th, u32, th_len) +{ +#ifdef CONFIG_SYN_COOKIES + u32 cookie; + u16 mss; + + if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) + return -EINVAL; + + if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) + return -EINVAL; + + if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies) + return -ENOENT; + + if (!th->syn || th->ack || th->fin || th->rst) + return -EINVAL; + + if (unlikely(iph_len < sizeof(struct iphdr))) + return -EINVAL; + + /* Both struct iphdr and struct ipv6hdr have the version field at the + * same offset so we can cast to the shorter header (struct iphdr). + */ + switch (((struct iphdr *)iph)->version) { + case 4: + if (sk->sk_family == AF_INET6 && sk->sk_ipv6only) + return -EINVAL; + + mss = tcp_v4_get_syncookie(sk, iph, th, &cookie); + break; + +#if IS_BUILTIN(CONFIG_IPV6) + case 6: + if (unlikely(iph_len < sizeof(struct ipv6hdr))) + return -EINVAL; + + if (sk->sk_family != AF_INET6) + return -EINVAL; + + mss = tcp_v6_get_syncookie(sk, iph, th, &cookie); + break; +#endif /* CONFIG_IPV6 */ + + default: + return -EPROTONOSUPPORT; + } + if (mss == 0) + return -ENOENT; + + return cookie | ((u64)mss << 32); +#else + return -EOPNOTSUPP; +#endif /* CONFIG_SYN_COOKIES */ +} + +static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { + .func = bpf_tcp_gen_syncookie, + .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5886,7 +5883,7 @@ bool bpf_helper_changes_pkt_data(void *func) return false; } -static const struct bpf_func_proto * +const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -5926,6 +5923,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: return bpf_get_trace_printk_proto(); + case BPF_FUNC_jiffies64: + return &bpf_jiffies64_proto; default: return NULL; } @@ -5999,6 +5998,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; default: return bpf_base_func_proto(func_id); } @@ -6019,6 +6020,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; @@ -6135,6 +6138,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_skb_ecn_set_ce: return &bpf_skb_ecn_set_ce_proto; + case BPF_FUNC_tcp_gen_syncookie: + return &bpf_tcp_gen_syncookie_proto; #endif default: return bpf_base_func_proto(func_id); @@ -6174,6 +6179,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; + case BPF_FUNC_tcp_gen_syncookie: + return &bpf_tcp_gen_syncookie_proto; #endif default: return bpf_base_func_proto(func_id); @@ -6267,6 +6274,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; @@ -7245,7 +7254,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, cb[0]) ... offsetofend(struct __sk_buff, cb[4]) - 1: - BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); + BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20); BUILD_BUG_ON((offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data)) % sizeof(__u64)); @@ -7264,7 +7273,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, tc_classid): - BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2); + BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2); off = si->off; off -= offsetof(struct __sk_buff, tc_classid); @@ -7335,7 +7344,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, #endif break; case offsetof(struct __sk_buff, family): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, @@ -7346,7 +7355,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, 2, target_size)); break; case offsetof(struct __sk_buff, remote_ip4): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, @@ -7357,7 +7366,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, 4, target_size)); break; case offsetof(struct __sk_buff, local_ip4): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), @@ -7371,7 +7380,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, remote_ip6[0]) ... offsetof(struct __sk_buff, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; @@ -7391,7 +7400,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, local_ip6[0]) ... offsetof(struct __sk_buff, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; @@ -7410,7 +7419,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, remote_port): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, @@ -7425,7 +7434,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, local_port): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, @@ -7436,7 +7445,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, tstamp): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8); + BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8); if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_DW, @@ -7474,7 +7483,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, target_size)); break; case offsetof(struct __sk_buff, wire_len): - BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4); + BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4); off = si->off; off -= offsetof(struct __sk_buff, wire_len); @@ -7504,7 +7513,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4); + BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4); if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, @@ -7515,7 +7524,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock, mark): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_mark) != 4); + BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4); if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, @@ -7526,7 +7535,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock, priority): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4); + BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4); if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, @@ -7542,34 +7551,34 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_family, - FIELD_SIZEOF(struct sock_common, + sizeof_field(struct sock_common, skc_family), target_size)); break; case offsetof(struct bpf_sock, type): - BUILD_BUG_ON(HWEIGHT32(SK_FL_TYPE_MASK) != BITS_PER_BYTE * 2); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); - *target_size = 2; + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_type), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_type, + sizeof_field(struct sock, sk_type), + target_size)); break; case offsetof(struct bpf_sock, protocol): - BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); - *target_size = 1; + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_protocol), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_protocol, + sizeof_field(struct sock, sk_protocol), + target_size)); break; case offsetof(struct bpf_sock, src_ip4): *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_rcv_saddr, - FIELD_SIZEOF(struct sock_common, + sizeof_field(struct sock_common, skc_rcv_saddr), target_size)); break; @@ -7578,7 +7587,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_daddr, - FIELD_SIZEOF(struct sock_common, + sizeof_field(struct sock_common, skc_daddr), target_size)); break; @@ -7592,7 +7601,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, bpf_target_off( struct sock_common, skc_v6_rcv_saddr.s6_addr32[0], - FIELD_SIZEOF(struct sock_common, + sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]), target_size) + off); #else @@ -7609,7 +7618,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_v6_daddr.s6_addr32[0], - FIELD_SIZEOF(struct sock_common, + sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]), target_size) + off); #else @@ -7623,7 +7632,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, BPF_FIELD_SIZEOF(struct sock_common, skc_num), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_num, - FIELD_SIZEOF(struct sock_common, + sizeof_field(struct sock_common, skc_num), target_size)); break; @@ -7633,7 +7642,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, BPF_FIELD_SIZEOF(struct sock_common, skc_dport), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_dport, - FIELD_SIZEOF(struct sock_common, + sizeof_field(struct sock_common, skc_dport), target_size)); break; @@ -7643,7 +7652,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, BPF_FIELD_SIZEOF(struct sock_common, skc_state), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_state, - FIELD_SIZEOF(struct sock_common, + sizeof_field(struct sock_common, skc_state), target_size)); break; @@ -7738,7 +7747,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->src_reg, offsetof(S, F)); \ *insn++ = BPF_LDX_MEM( \ SIZE, si->dst_reg, si->dst_reg, \ - bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ + bpf_target_off(NS, NF, sizeof_field(NS, NF), \ target_size) \ + OFF); \ } while (0) @@ -7769,7 +7778,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ si->dst_reg, offsetof(S, F)); \ *insn++ = BPF_STX_MEM(SIZE, tmp_reg, si->src_reg, \ - bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ + bpf_target_off(NS, NF, sizeof_field(NS, NF), \ target_size) \ + OFF); \ *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ @@ -7831,8 +7840,8 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, */ BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) != offsetof(struct sockaddr_in6, sin6_port)); - BUILD_BUG_ON(FIELD_SIZEOF(struct sockaddr_in, sin_port) != - FIELD_SIZEOF(struct sockaddr_in6, sin6_port)); + BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != + sizeof_field(struct sockaddr_in6, sin6_port)); SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, sin6_port, tmp_reg); @@ -7844,20 +7853,13 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_addr, type): - SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( - struct bpf_sock_addr_kern, struct sock, sk, - __sk_flags_offset, BPF_W, 0); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_type); break; case offsetof(struct bpf_sock_addr, protocol): - SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( - struct bpf_sock_addr_kern, struct sock, sk, - __sk_flags_offset, BPF_W, 0); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, - SK_FL_PROTO_SHIFT); + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_protocol); break; case offsetof(struct bpf_sock_addr, msg_src_ip4): @@ -7898,8 +7900,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, /* Helper macro for adding read access to tcp_sock or sock fields. */ #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ - BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ - FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ + sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ @@ -7932,8 +7934,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, #define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ int reg = BPF_REG_9; \ - BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ - FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ + sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ @@ -7974,12 +7976,12 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, switch (si->off) { case offsetof(struct bpf_sock_ops, op) ... offsetof(struct bpf_sock_ops, replylong[3]): - BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, op) != - FIELD_SIZEOF(struct bpf_sock_ops_kern, op)); - BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, reply) != - FIELD_SIZEOF(struct bpf_sock_ops_kern, reply)); - BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, replylong) != - FIELD_SIZEOF(struct bpf_sock_ops_kern, replylong)); + BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, op) != + sizeof_field(struct bpf_sock_ops_kern, op)); + BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) != + sizeof_field(struct bpf_sock_ops_kern, reply)); + BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) != + sizeof_field(struct bpf_sock_ops_kern, replylong)); off = si->off; off -= offsetof(struct bpf_sock_ops, op); off += offsetof(struct bpf_sock_ops_kern, op); @@ -7992,7 +7994,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_ops, family): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), @@ -8003,7 +8005,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_ops, remote_ip4): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), @@ -8014,7 +8016,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_ops, local_ip4): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( @@ -8029,7 +8031,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, case offsetof(struct bpf_sock_ops, remote_ip6[0]) ... offsetof(struct bpf_sock_ops, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; @@ -8050,7 +8052,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, case offsetof(struct bpf_sock_ops, local_ip6[0]) ... offsetof(struct bpf_sock_ops, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; @@ -8069,7 +8071,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_ops, remote_port): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), @@ -8083,7 +8085,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_ops, local_port): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), @@ -8103,7 +8105,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_ops, state): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1); + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), @@ -8114,7 +8116,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_ops, rtt_min): - BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != + BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != sizeof(struct minmax)); BUILD_BUG_ON(sizeof(struct minmax) < sizeof(struct minmax_sample)); @@ -8125,7 +8127,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct tcp_sock, rtt_min) + - FIELD_SIZEOF(struct minmax_sample, t)); + sizeof_field(struct minmax_sample, t)); break; case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): @@ -8267,7 +8269,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, offsetof(struct sk_msg, data_end)); break; case offsetof(struct sk_msg_md, family): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), @@ -8278,7 +8280,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct sk_msg_md, remote_ip4): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), @@ -8289,7 +8291,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct sk_msg_md, local_ip4): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( @@ -8304,7 +8306,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, case offsetof(struct sk_msg_md, remote_ip6[0]) ... offsetof(struct sk_msg_md, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; @@ -8325,7 +8327,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, case offsetof(struct sk_msg_md, local_ip6[0]) ... offsetof(struct sk_msg_md, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; @@ -8344,7 +8346,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct sk_msg_md, remote_port): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), @@ -8358,7 +8360,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct sk_msg_md, local_port): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), @@ -8585,16 +8587,6 @@ out: } #ifdef CONFIG_INET -struct sk_reuseport_kern { - struct sk_buff *skb; - struct sock *sk; - struct sock *selected_sk; - void *data_end; - u32 hash; - u32 reuseport_id; - bool bind_inany; -}; - static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock_reuseport *reuse, struct sock *sk, struct sk_buff *skb, @@ -8757,13 +8749,13 @@ sk_reuseport_is_valid_access(int off, int size, return size == size_default; /* Fields that allow narrowing */ - case offsetof(struct sk_reuseport_md, eth_protocol): - if (size < FIELD_SIZEOF(struct sk_buff, protocol)) + case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): + if (size < sizeof_field(struct sk_buff, protocol)) return false; /* fall through */ - case offsetof(struct sk_reuseport_md, ip_protocol): - case offsetof(struct sk_reuseport_md, bind_inany): - case offsetof(struct sk_reuseport_md, len): + case bpf_ctx_range(struct sk_reuseport_md, ip_protocol): + case bpf_ctx_range(struct sk_reuseport_md, bind_inany): + case bpf_ctx_range(struct sk_reuseport_md, len): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); @@ -8776,7 +8768,7 @@ sk_reuseport_is_valid_access(int off, int size, *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ si->dst_reg, si->src_reg, \ bpf_target_off(struct sk_reuseport_kern, F, \ - FIELD_SIZEOF(struct sk_reuseport_kern, F), \ + sizeof_field(struct sk_reuseport_kern, F), \ target_size)); \ }) @@ -8786,11 +8778,11 @@ sk_reuseport_is_valid_access(int off, int size, skb, \ SKB_FIELD) -#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \ - SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \ - struct sock, \ - sk, \ - SK_FIELD, BPF_SIZE, EXTRA_OFF) +#define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \ + SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ + struct sock, \ + sk, \ + SK_FIELD) static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, @@ -8814,16 +8806,7 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct sk_reuseport_md, ip_protocol): - BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE); - SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset, - BPF_W, 0); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, - SK_FL_PROTO_SHIFT); - /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian - * aware. No further narrowing or masking is needed. - */ - *target_size = 1; + SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol); break; case offsetof(struct sk_reuseport_md, data_end): @@ -8851,3 +8834,11 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = { const struct bpf_prog_ops sk_reuseport_prog_ops = { }; #endif /* CONFIG_INET */ + +DEFINE_BPF_DISPATCHER(bpf_dispatcher_xdp) + +void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) +{ + bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(bpf_dispatcher_xdp), + prev_prog, prog); +} diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 3e6fedb57bc1..a1670dff0629 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -114,19 +114,50 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, { struct bpf_prog *attached; struct net *net; + int ret = 0; net = current->nsproxy->net_ns; mutex_lock(&flow_dissector_mutex); + + if (net == &init_net) { + /* BPF flow dissector in the root namespace overrides + * any per-net-namespace one. When attaching to root, + * make sure we don't have any BPF program attached + * to the non-root namespaces. + */ + struct net *ns; + + for_each_net(ns) { + if (ns == &init_net) + continue; + if (rcu_access_pointer(ns->flow_dissector_prog)) { + ret = -EEXIST; + goto out; + } + } + } else { + /* Make sure root flow dissector is not attached + * when attaching to the non-root namespace. + */ + if (rcu_access_pointer(init_net.flow_dissector_prog)) { + ret = -EEXIST; + goto out; + } + } + attached = rcu_dereference_protected(net->flow_dissector_prog, lockdep_is_held(&flow_dissector_mutex)); - if (attached) { - /* Only one BPF program can be attached at a time */ - mutex_unlock(&flow_dissector_mutex); - return -EEXIST; + if (attached == prog) { + /* The same program cannot be attached twice */ + ret = -EINVAL; + goto out; } rcu_assign_pointer(net->flow_dissector_prog, prog); + if (attached) + bpf_prog_put(attached); +out: mutex_unlock(&flow_dissector_mutex); - return 0; + return ret; } int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) @@ -142,32 +173,11 @@ int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) mutex_unlock(&flow_dissector_mutex); return -ENOENT; } - bpf_prog_put(attached); RCU_INIT_POINTER(net->flow_dissector_prog, NULL); + bpf_prog_put(attached); mutex_unlock(&flow_dissector_mutex); return 0; } -/** - * skb_flow_get_be16 - extract be16 entity - * @skb: sk_buff to extract from - * @poff: offset to extract at - * @data: raw buffer pointer to the packet - * @hlen: packet header length - * - * The function will try to retrieve a be32 entity at - * offset poff - */ -static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, - void *data, int hlen) -{ - __be16 *u, _u; - - u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u); - if (u) - return *u; - - return 0; -} /** * __skb_flow_get_ports - extract the upper layer ports and return them @@ -203,6 +213,72 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); +static bool icmp_has_id(u8 type) +{ + switch (type) { + case ICMP_ECHO: + case ICMP_ECHOREPLY: + case ICMP_TIMESTAMP: + case ICMP_TIMESTAMPREPLY: + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + return true; + } + + return false; +} + +/** + * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields + * @skb: sk_buff to extract from + * @key_icmp: struct flow_dissector_key_icmp to fill + * @data: raw buffer pointer to the packet + * @thoff: offset to extract at + * @hlen: packet header length + */ +void skb_flow_get_icmp_tci(const struct sk_buff *skb, + struct flow_dissector_key_icmp *key_icmp, + void *data, int thoff, int hlen) +{ + struct icmphdr *ih, _ih; + + ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih); + if (!ih) + return; + + key_icmp->type = ih->type; + key_icmp->code = ih->code; + + /* As we use 0 to signal that the Id field is not present, + * avoid confusion with packets without such field + */ + if (icmp_has_id(ih->type)) + key_icmp->id = ih->un.echo.id ? : 1; + else + key_icmp->id = 0; +} +EXPORT_SYMBOL(skb_flow_get_icmp_tci); + +/* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet + * using skb_flow_get_icmp_tci(). + */ +static void __skb_flow_dissect_icmp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, + void *data, int thoff, int hlen) +{ + struct flow_dissector_key_icmp *key_icmp; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP)) + return; + + key_icmp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ICMP, + target_container); + + skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); +} + void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) @@ -523,8 +599,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, offset += sizeof(struct gre_base_hdr); if (hdr->flags & GRE_CSUM) - offset += FIELD_SIZEOF(struct gre_full_hdr, csum) + - FIELD_SIZEOF(struct gre_full_hdr, reserved1); + offset += sizeof_field(struct gre_full_hdr, csum) + + sizeof_field(struct gre_full_hdr, reserved1); if (hdr->flags & GRE_KEY) { const __be32 *keyid; @@ -546,11 +622,11 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, else key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; } - offset += FIELD_SIZEOF(struct gre_full_hdr, key); + offset += sizeof_field(struct gre_full_hdr, key); } if (hdr->flags & GRE_SEQ) - offset += FIELD_SIZEOF(struct pptp_gre_header, seq); + offset += sizeof_field(struct pptp_gre_header, seq); if (gre_ver == 0) { if (*p_proto == htons(ETH_P_TEB)) { @@ -577,7 +653,7 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, u8 *ppp_hdr; if (hdr->flags & GRE_ACK) - offset += FIELD_SIZEOF(struct pptp_gre_header, ack); + offset += sizeof_field(struct pptp_gre_header, ack); ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_ppp_hdr), @@ -684,6 +760,31 @@ __skb_flow_dissect_tcp(const struct sk_buff *skb, } static void +__skb_flow_dissect_ports(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, int nhoff, + u8 ip_proto, int hlen) +{ + enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX; + struct flow_dissector_key_ports *key_ports; + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) + dissector_ports = FLOW_DISSECTOR_KEY_PORTS; + else if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE)) + dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE; + + if (dissector_ports == FLOW_DISSECTOR_KEY_MAX) + return; + + key_ports = skb_flow_dissector_target(flow_dissector, + dissector_ports, + target_container); + key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, + data, hlen); +} + +static void __skb_flow_dissect_ipv4(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, void *data, const struct iphdr *iph) @@ -733,10 +834,11 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector *flow_dissector, void *target_container) { + struct flow_dissector_key_ports *key_ports = NULL; struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; - struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_tags *key_tags; key_control = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL, @@ -774,17 +876,32 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); + else if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE)) + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + + if (key_ports) { key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL, + target_container); + key_tags->flow_label = ntohl(flow_keys->flow_label); + } } bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen) + __be16 proto, int nhoff, int hlen, unsigned int flags) { struct bpf_flow_keys *flow_keys = ctx->flow_keys; u32 result; @@ -795,6 +912,14 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, flow_keys->nhoff = nhoff; flow_keys->thoff = flow_keys->nhoff; + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG != + (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG); + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL != + (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP != + (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP); + flow_keys->flags = flags; + preempt_disable(); result = BPF_PROG_RUN(prog, ctx); preempt_enable(); @@ -835,8 +960,6 @@ bool __skb_flow_dissect(const struct net *net, struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; - struct flow_dissector_key_ports *key_ports; - struct flow_dissector_key_icmp *key_icmp; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; struct bpf_prog *attached = NULL; @@ -853,9 +976,10 @@ bool __skb_flow_dissect(const struct net *net, nhoff = skb_network_offset(skb); hlen = skb_headlen(skb); #if IS_ENABLED(CONFIG_NET_DSA) - if (unlikely(skb->dev && netdev_uses_dsa(skb->dev))) { + if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) && + proto == htons(ETH_P_XDSA))) { const struct dsa_device_ops *ops; - int offset; + int offset = 0; ops = skb->dev->dsa_ptr->tag_ops; if (ops->flow_dissect && @@ -893,7 +1017,10 @@ bool __skb_flow_dissect(const struct net *net, WARN_ON_ONCE(!net); if (net) { rcu_read_lock(); - attached = rcu_dereference(net->flow_dissector_prog); + attached = rcu_dereference(init_net.flow_dissector_prog); + + if (!attached) + attached = rcu_dereference(net->flow_dissector_prog); if (attached) { struct bpf_flow_keys flow_keys; @@ -914,7 +1041,7 @@ bool __skb_flow_dissect(const struct net *net, } ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff, - hlen); + hlen, flags); __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); @@ -1278,26 +1405,19 @@ ip_proto_again: data, nhoff, hlen); break; - default: + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + __skb_flow_dissect_icmp(skb, flow_dissector, target_container, + data, nhoff, hlen); break; - } - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS) && - !(key_control->flags & FLOW_DIS_IS_FRAGMENT)) { - key_ports = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS, - target_container); - key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, - data, hlen); + default: + break; } - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ICMP)) { - key_icmp = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_ICMP, - target_container); - key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen); - } + if (!(key_control->flags & FLOW_DIS_IS_FRAGMENT)) + __skb_flow_dissect_ports(skb, flow_dissector, target_container, + data, nhoff, ip_proto, hlen); /* Process result of IP proto processing */ switch (fdret) { @@ -1333,32 +1453,23 @@ out_bad: } EXPORT_SYMBOL(__skb_flow_dissect); -static u32 hashrnd __read_mostly; +static siphash_key_t hashrnd __read_mostly; static __always_inline void __flow_hash_secret_init(void) { net_get_random_once(&hashrnd, sizeof(hashrnd)); } -static __always_inline u32 __flow_hash_words(const u32 *words, u32 length, - u32 keyval) +static const void *flow_keys_hash_start(const struct flow_keys *flow) { - return jhash2(words, length, keyval); -} - -static inline const u32 *flow_keys_hash_start(const struct flow_keys *flow) -{ - const void *p = flow; - - BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32)); - return (const u32 *)(p + FLOW_KEYS_HASH_OFFSET); + BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT); + return &flow->FLOW_KEYS_HASH_START_FIELD; } static inline size_t flow_keys_hash_length(const struct flow_keys *flow) { size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); + BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); - BUILD_BUG_ON(offsetof(typeof(*flow), addrs) != - sizeof(*flow) - sizeof(flow->addrs)); switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: @@ -1371,7 +1482,7 @@ static inline size_t flow_keys_hash_length(const struct flow_keys *flow) diff -= sizeof(flow->addrs.tipckey); break; } - return (sizeof(*flow) - diff) / sizeof(u32); + return sizeof(*flow) - diff; } __be32 flow_get_u32_src(const struct flow_keys *flow) @@ -1404,6 +1515,9 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow) } EXPORT_SYMBOL(flow_get_u32_dst); +/* Sort the source and destination IP (and the ports if the IP are the same), + * to have consistent hash within the two directions + */ static inline void __flow_hash_consistentify(struct flow_keys *keys) { int addr_diff, i; @@ -1437,14 +1551,15 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys) } } -static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) +static inline u32 __flow_hash_from_keys(struct flow_keys *keys, + const siphash_key_t *keyval) { u32 hash; __flow_hash_consistentify(keys); - hash = __flow_hash_words(flow_keys_hash_start(keys), - flow_keys_hash_length(keys), keyval); + hash = siphash(flow_keys_hash_start(keys), + flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; @@ -1454,12 +1569,13 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) u32 flow_hash_from_keys(struct flow_keys *keys) { __flow_hash_secret_init(); - return __flow_hash_from_keys(keys, hashrnd); + return __flow_hash_from_keys(keys, &hashrnd); } EXPORT_SYMBOL(flow_hash_from_keys); static inline u32 ___skb_get_hash(const struct sk_buff *skb, - struct flow_keys *keys, u32 keyval) + struct flow_keys *keys, + const siphash_key_t *keyval) { skb_flow_dissect_flow_keys(skb, keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); @@ -1507,7 +1623,7 @@ u32 __skb_get_hash_symmetric(const struct sk_buff *skb) &keys, NULL, 0, 0, 0, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); - return __flow_hash_from_keys(&keys, hashrnd); + return __flow_hash_from_keys(&keys, &hashrnd); } EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric); @@ -1527,13 +1643,14 @@ void __skb_get_hash(struct sk_buff *skb) __flow_hash_secret_init(); - hash = ___skb_get_hash(skb, &keys, hashrnd); + hash = ___skb_get_hash(skb, &keys, &hashrnd); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } EXPORT_SYMBOL(__skb_get_hash); -__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) +__u32 skb_get_hash_perturb(const struct sk_buff *skb, + const siphash_key_t *perturb) { struct flow_keys keys; diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index d63b970784dc..45b6a59ac124 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -2,6 +2,8 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <net/flow_offload.h> +#include <linux/rtnetlink.h> +#include <linux/mutex.h> struct flow_rule *flow_rule_alloc(unsigned int num_actions) { @@ -280,3 +282,241 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f, } } EXPORT_SYMBOL(flow_block_cb_setup_simple); + +static LIST_HEAD(block_cb_list); + +static struct rhashtable indr_setup_block_ht; + +struct flow_indr_block_cb { + struct list_head list; + void *cb_priv; + flow_indr_block_bind_cb_t *cb; + void *cb_ident; +}; + +struct flow_indr_block_dev { + struct rhash_head ht_node; + struct net_device *dev; + unsigned int refcnt; + struct list_head cb_list; +}; + +static const struct rhashtable_params flow_indr_setup_block_ht_params = { + .key_offset = offsetof(struct flow_indr_block_dev, dev), + .head_offset = offsetof(struct flow_indr_block_dev, ht_node), + .key_len = sizeof(struct net_device *), +}; + +static struct flow_indr_block_dev * +flow_indr_block_dev_lookup(struct net_device *dev) +{ + return rhashtable_lookup_fast(&indr_setup_block_ht, &dev, + flow_indr_setup_block_ht_params); +} + +static struct flow_indr_block_dev * +flow_indr_block_dev_get(struct net_device *dev) +{ + struct flow_indr_block_dev *indr_dev; + + indr_dev = flow_indr_block_dev_lookup(dev); + if (indr_dev) + goto inc_ref; + + indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL); + if (!indr_dev) + return NULL; + + INIT_LIST_HEAD(&indr_dev->cb_list); + indr_dev->dev = dev; + if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node, + flow_indr_setup_block_ht_params)) { + kfree(indr_dev); + return NULL; + } + +inc_ref: + indr_dev->refcnt++; + return indr_dev; +} + +static void flow_indr_block_dev_put(struct flow_indr_block_dev *indr_dev) +{ + if (--indr_dev->refcnt) + return; + + rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node, + flow_indr_setup_block_ht_params); + kfree(indr_dev); +} + +static struct flow_indr_block_cb * +flow_indr_block_cb_lookup(struct flow_indr_block_dev *indr_dev, + flow_indr_block_bind_cb_t *cb, void *cb_ident) +{ + struct flow_indr_block_cb *indr_block_cb; + + list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) + if (indr_block_cb->cb == cb && + indr_block_cb->cb_ident == cb_ident) + return indr_block_cb; + return NULL; +} + +static struct flow_indr_block_cb * +flow_indr_block_cb_add(struct flow_indr_block_dev *indr_dev, void *cb_priv, + flow_indr_block_bind_cb_t *cb, void *cb_ident) +{ + struct flow_indr_block_cb *indr_block_cb; + + indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident); + if (indr_block_cb) + return ERR_PTR(-EEXIST); + + indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL); + if (!indr_block_cb) + return ERR_PTR(-ENOMEM); + + indr_block_cb->cb_priv = cb_priv; + indr_block_cb->cb = cb; + indr_block_cb->cb_ident = cb_ident; + list_add(&indr_block_cb->list, &indr_dev->cb_list); + + return indr_block_cb; +} + +static void flow_indr_block_cb_del(struct flow_indr_block_cb *indr_block_cb) +{ + list_del(&indr_block_cb->list); + kfree(indr_block_cb); +} + +static DEFINE_MUTEX(flow_indr_block_cb_lock); + +static void flow_block_cmd(struct net_device *dev, + flow_indr_block_bind_cb_t *cb, void *cb_priv, + enum flow_block_command command) +{ + struct flow_indr_block_entry *entry; + + mutex_lock(&flow_indr_block_cb_lock); + list_for_each_entry(entry, &block_cb_list, list) { + entry->cb(dev, cb, cb_priv, command); + } + mutex_unlock(&flow_indr_block_cb_lock); +} + +int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, + flow_indr_block_bind_cb_t *cb, + void *cb_ident) +{ + struct flow_indr_block_cb *indr_block_cb; + struct flow_indr_block_dev *indr_dev; + int err; + + indr_dev = flow_indr_block_dev_get(dev); + if (!indr_dev) + return -ENOMEM; + + indr_block_cb = flow_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident); + err = PTR_ERR_OR_ZERO(indr_block_cb); + if (err) + goto err_dev_put; + + flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, + FLOW_BLOCK_BIND); + + return 0; + +err_dev_put: + flow_indr_block_dev_put(indr_dev); + return err; +} +EXPORT_SYMBOL_GPL(__flow_indr_block_cb_register); + +int flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, + flow_indr_block_bind_cb_t *cb, + void *cb_ident) +{ + int err; + + rtnl_lock(); + err = __flow_indr_block_cb_register(dev, cb_priv, cb, cb_ident); + rtnl_unlock(); + + return err; +} +EXPORT_SYMBOL_GPL(flow_indr_block_cb_register); + +void __flow_indr_block_cb_unregister(struct net_device *dev, + flow_indr_block_bind_cb_t *cb, + void *cb_ident) +{ + struct flow_indr_block_cb *indr_block_cb; + struct flow_indr_block_dev *indr_dev; + + indr_dev = flow_indr_block_dev_lookup(dev); + if (!indr_dev) + return; + + indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident); + if (!indr_block_cb) + return; + + flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, + FLOW_BLOCK_UNBIND); + + flow_indr_block_cb_del(indr_block_cb); + flow_indr_block_dev_put(indr_dev); +} +EXPORT_SYMBOL_GPL(__flow_indr_block_cb_unregister); + +void flow_indr_block_cb_unregister(struct net_device *dev, + flow_indr_block_bind_cb_t *cb, + void *cb_ident) +{ + rtnl_lock(); + __flow_indr_block_cb_unregister(dev, cb, cb_ident); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister); + +void flow_indr_block_call(struct net_device *dev, + struct flow_block_offload *bo, + enum flow_block_command command) +{ + struct flow_indr_block_cb *indr_block_cb; + struct flow_indr_block_dev *indr_dev; + + indr_dev = flow_indr_block_dev_lookup(dev); + if (!indr_dev) + return; + + list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) + indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK, + bo); +} +EXPORT_SYMBOL_GPL(flow_indr_block_call); + +void flow_indr_add_block_cb(struct flow_indr_block_entry *entry) +{ + mutex_lock(&flow_indr_block_cb_lock); + list_add_tail(&entry->list, &block_cb_list); + mutex_unlock(&flow_indr_block_cb_lock); +} +EXPORT_SYMBOL_GPL(flow_indr_add_block_cb); + +void flow_indr_del_block_cb(struct flow_indr_block_entry *entry) +{ + mutex_lock(&flow_indr_block_cb_lock); + list_del(&entry->list); + mutex_unlock(&flow_indr_block_cb_lock); +} +EXPORT_SYMBOL_GPL(flow_indr_del_block_cb); + +static int __init init_flow_indr_rhashtable(void) +{ + return rhashtable_init(&indr_setup_block_ht, + &flow_indr_setup_block_ht_params); +} +subsys_initcall(init_flow_indr_rhashtable); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index bfe7bdd4c340..80dbf2f4016e 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -48,7 +48,7 @@ struct net_rate_estimator { u8 intvl_log; /* period : (250ms << intvl_log) */ seqcount_t seq; - u32 last_packets; + u64 last_packets; u64 last_bytes; u64 avpps; @@ -83,7 +83,7 @@ static void est_timer(struct timer_list *t) brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log); brate -= (est->avbps >> est->ewma_log); - rate = (u64)(b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); + rate = (b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); rate -= (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 36888f5e09eb..1d653fbfcf52 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -123,8 +123,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, for_each_possible_cpu(i) { struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i); unsigned int start; - u64 bytes; - u32 packets; + u64 bytes, packets; do { start = u64_stats_fetch_begin_irq(&bcpu->syncp); @@ -176,12 +175,17 @@ ___gnet_stats_copy_basic(const seqcount_t *running, if (d->tail) { struct gnet_stats_basic sb; + int res; memset(&sb, 0, sizeof(sb)); sb.bytes = bstats.bytes; sb.packets = bstats.packets; - return gnet_stats_copy(d, type, &sb, sizeof(sb), - TCA_STATS_PAD); + res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD); + if (res < 0 || sb.packets == bstats.packets) + return res; + /* emit 64bit stats only if needed */ + return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats.packets, + sizeof(bstats.packets), TCA_STATS_PAD); } return 0; } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index f93785e5833c..99a6de52b21d 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -88,11 +88,16 @@ static int bpf_lwt_input_reroute(struct sk_buff *skb) int err = -EINVAL; if (skb->protocol == htons(ETH_P_IP)) { + struct net_device *dev = skb_dst(skb)->dev; struct iphdr *iph = ip_hdr(skb); + dev_hold(dev); + skb_dst_drop(skb); err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb_dst(skb)->dev); + iph->tos, dev); + dev_put(dev); } else if (skb->protocol == htons(ETH_P_IPV6)) { + skb_dst_drop(skb); err = ipv6_stub->ipv6_route_input(skb); } else { err = -EAFNOSUPPORT; @@ -225,9 +230,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) fl6.daddr = iph6->daddr; fl6.saddr = iph6->saddr; - err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6); - if (unlikely(err)) - goto err; + dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto err; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f79e61c570ea..789a73aa7bd8 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -98,9 +98,6 @@ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) static void neigh_cleanup_and_release(struct neighbour *neigh) { - if (neigh->parms->neigh_cleanup) - neigh->parms->neigh_cleanup(neigh); - trace_neigh_cleanup_and_release(neigh, 0); __neigh_notify(neigh, RTM_DELNEIGH, 0, 0); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); @@ -1197,7 +1194,7 @@ static void neigh_update_hhs(struct neighbour *neigh) if (update) { hh = &neigh->hh; - if (hh->hh_len) { + if (READ_ONCE(hh->hh_len)) { write_seqlock_bh(&hh->hh_lock); update(hh, neigh->dev, neigh->ha); write_sequnlock_bh(&hh->hh_lock); @@ -1476,7 +1473,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) struct net_device *dev = neigh->dev; unsigned int seq; - if (dev->header_ops->cache && !neigh->hh.hh_len) + if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len)) neigh_hh_init(neigh); do { @@ -2052,8 +2049,8 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, goto nla_put_failure; { unsigned long now = jiffies; - unsigned int flush_delta = now - tbl->last_flush; - unsigned int rand_delta = now - tbl->last_rand; + long flush_delta = now - tbl->last_flush; + long rand_delta = now - tbl->last_rand; struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, @@ -3033,7 +3030,7 @@ static struct neighbour *neigh_get_first(struct seq_file *seq) struct net *net = seq_file_net(seq); struct neigh_hash_table *nht = state->nht; struct neighbour *n = NULL; - int bucket = state->bucket; + int bucket; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) { @@ -3293,6 +3290,7 @@ static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } + (*pos)++; return NULL; } diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 36347933ec3a..6bbd06f7dc7d 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -20,8 +20,8 @@ static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff struct hlist_head *h; unsigned int count = 0, offset = get_offset(*pos); - h = &net->dev_name_head[get_bucket(*pos)]; - hlist_for_each_entry_rcu(dev, h, name_hlist) { + h = &net->dev_index_head[get_bucket(*pos)]; + hlist_for_each_entry_rcu(dev, h, index_hlist) { if (++count == offset) return dev; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 865ba6ca16eb..4c826b8bf9b1 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -919,25 +919,30 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) struct kobject *kobj = &queue->kobj; int error = 0; + /* Kobject_put later will trigger rx_queue_release call which + * decreases dev refcount: Take that reference here + */ + dev_hold(queue->dev); + kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, "rx-%u", index); if (error) - return error; - - dev_hold(queue->dev); + goto err; if (dev->sysfs_rx_queue_group) { error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); - if (error) { - kobject_put(kobj); - return error; - } + if (error) + goto err; } kobject_uevent(kobj, KOBJ_ADD); return error; + +err: + kobject_put(kobj); + return error; } #endif /* CONFIG_SYSFS */ @@ -1457,25 +1462,29 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) struct kobject *kobj = &queue->kobj; int error = 0; + /* Kobject_put later will trigger netdev_queue_release call + * which decreases dev refcount: Take that reference here + */ + dev_hold(queue->dev); + kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, "tx-%u", index); if (error) - return error; - - dev_hold(queue->dev); + goto err; #ifdef CONFIG_BQL error = sysfs_create_group(kobj, &dql_group); - if (error) { - kobject_put(kobj); - return error; - } + if (error) + goto err; #endif kobject_uevent(kobj, KOBJ_ADD); - return 0; + +err: + kobject_put(kobj); + return error; } #endif /* CONFIG_SYSFS */ diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a0e0d298c991..757cc1d084e7 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -211,16 +211,10 @@ static int net_eq_idr(int id, void *net, void *peer) return 0; } -/* Should be called with nsid_lock held. If a new id is assigned, the bool alloc - * is set to true, thus the caller knows that the new id must be notified via - * rtnl. - */ -static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) +/* Must be called from RCU-critical section or with nsid_lock held */ +static int __peernet2id(const struct net *net, struct net *peer) { int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); - bool alloc_it = *alloc; - - *alloc = false; /* Magic value for id 0. */ if (id == NET_ID_ZERO) @@ -228,61 +222,60 @@ static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) if (id > 0) return id; - if (alloc_it) { - id = alloc_netid(net, peer, -1); - *alloc = true; - return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; - } - return NETNSA_NSID_NOT_ASSIGNED; } -/* should be called with nsid_lock held */ -static int __peernet2id(struct net *net, struct net *peer) -{ - bool no = false; - - return __peernet2id_alloc(net, peer, &no); -} - -static void rtnl_net_notifyid(struct net *net, int cmd, int id); +static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, + struct nlmsghdr *nlh, gfp_t gfp); /* This function returns the id of a peer netns. If no id is assigned, one will * be allocated and returned. */ -int peernet2id_alloc(struct net *net, struct net *peer) +int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) { - bool alloc = false, alive = false; int id; if (refcount_read(&net->count) == 0) return NETNSA_NSID_NOT_ASSIGNED; - spin_lock_bh(&net->nsid_lock); - /* - * When peer is obtained from RCU lists, we may race with + + spin_lock(&net->nsid_lock); + id = __peernet2id(net, peer); + if (id >= 0) { + spin_unlock(&net->nsid_lock); + return id; + } + + /* When peer is obtained from RCU lists, we may race with * its cleanup. Check whether it's alive, and this guarantees * we never hash a peer back to net->netns_ids, after it has * just been idr_remove()'d from there in cleanup_net(). */ - if (maybe_get_net(peer)) - alive = alloc = true; - id = __peernet2id_alloc(net, peer, &alloc); - spin_unlock_bh(&net->nsid_lock); - if (alloc && id >= 0) - rtnl_net_notifyid(net, RTM_NEWNSID, id); - if (alive) - put_net(peer); + if (!maybe_get_net(peer)) { + spin_unlock(&net->nsid_lock); + return NETNSA_NSID_NOT_ASSIGNED; + } + + id = alloc_netid(net, peer, -1); + spin_unlock(&net->nsid_lock); + + put_net(peer); + if (id < 0) + return NETNSA_NSID_NOT_ASSIGNED; + + rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp); + return id; } EXPORT_SYMBOL_GPL(peernet2id_alloc); /* This function returns, if assigned, the id of a peer netns. */ -int peernet2id(struct net *net, struct net *peer) +int peernet2id(const struct net *net, struct net *peer) { int id; - spin_lock_bh(&net->nsid_lock); + rcu_read_lock(); id = __peernet2id(net, peer); - spin_unlock_bh(&net->nsid_lock); + rcu_read_unlock(); + return id; } EXPORT_SYMBOL(peernet2id); @@ -290,12 +283,12 @@ EXPORT_SYMBOL(peernet2id); /* This function returns true is the peer netns has an id assigned into the * current netns. */ -bool peernet_has_id(struct net *net, struct net *peer) +bool peernet_has_id(const struct net *net, struct net *peer) { return peernet2id(net, peer) >= 0; } -struct net *get_net_ns_by_id(struct net *net, int id) +struct net *get_net_ns_by_id(const struct net *net, int id) { struct net *peer; @@ -478,6 +471,7 @@ struct net *copy_net_ns(unsigned long flags, if (rv < 0) { put_userns: + key_remove_domain(net->key_domain); put_user_ns(user_ns); net_drop_ns(net); dec_ucounts: @@ -526,19 +520,20 @@ static void unhash_nsid(struct net *net, struct net *last) for_each_net(tmp) { int id; - spin_lock_bh(&tmp->nsid_lock); + spin_lock(&tmp->nsid_lock); id = __peernet2id(tmp, net); if (id >= 0) idr_remove(&tmp->netns_ids, id); - spin_unlock_bh(&tmp->nsid_lock); + spin_unlock(&tmp->nsid_lock); if (id >= 0) - rtnl_net_notifyid(tmp, RTM_DELNSID, id); + rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL, + GFP_KERNEL); if (tmp == last) break; } - spin_lock_bh(&net->nsid_lock); + spin_lock(&net->nsid_lock); idr_destroy(&net->netns_ids); - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); } static LLIST_HEAD(cleanup_list); @@ -751,9 +746,9 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(peer); } - spin_lock_bh(&net->nsid_lock); + spin_lock(&net->nsid_lock); if (__peernet2id(net, peer) >= 0) { - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); err = -EEXIST; NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, @@ -762,9 +757,10 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, } err = alloc_netid(net, peer, nsid); - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); if (err >= 0) { - rtnl_net_notifyid(net, RTM_NEWNSID, err); + rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid, + nlh, GFP_KERNEL); err = 0; } else if (err == -ENOSPC && nsid >= 0) { err = -EEXIST; @@ -946,6 +942,7 @@ struct rtnl_net_dump_cb { int s_idx; }; +/* Runs in RCU-critical section. */ static int rtnl_net_dumpid_one(int id, void *peer, void *data) { struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data; @@ -1030,19 +1027,9 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) goto end; } - spin_lock_bh(&net_cb.tgt_net->nsid_lock); - if (net_cb.fillargs.add_ref && - !net_eq(net_cb.ref_net, net_cb.tgt_net) && - !spin_trylock_bh(&net_cb.ref_net->nsid_lock)) { - spin_unlock_bh(&net_cb.tgt_net->nsid_lock); - err = -EAGAIN; - goto end; - } + rcu_read_lock(); idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); - if (net_cb.fillargs.add_ref && - !net_eq(net_cb.ref_net, net_cb.tgt_net)) - spin_unlock_bh(&net_cb.ref_net->nsid_lock); - spin_unlock_bh(&net_cb.tgt_net->nsid_lock); + rcu_read_unlock(); cb->args[0] = net_cb.idx; end: @@ -1051,16 +1038,19 @@ end: return err < 0 ? err : skb->len; } -static void rtnl_net_notifyid(struct net *net, int cmd, int id) +static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, + struct nlmsghdr *nlh, gfp_t gfp) { struct net_fill_args fillargs = { + .portid = portid, + .seq = nlh ? nlh->nlmsg_seq : 0, .cmd = cmd, .nsid = id, }; struct sk_buff *msg; int err = -ENOMEM; - msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); + msg = nlmsg_new(rtnl_net_get_size(), gfp); if (!msg) goto out; @@ -1068,7 +1058,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id) if (err < 0) goto err_out; - rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, 0); + rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, gfp); return; err_out: diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 2cf27da1baeb..849380a622ef 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -122,7 +122,7 @@ static void queue_process(struct work_struct *work) txq = netdev_get_tx_queue(dev, q_index); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (netif_xmit_frozen_or_stopped(txq) || - netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) { + !dev_xmit_complete(netpoll_start_xmit(skb, dev, txq))) { skb_queue_head(&npinfo->txq, skb); HARD_TX_UNLOCK(dev, txq); local_irq_restore(flags); @@ -335,7 +335,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, HARD_TX_UNLOCK(dev, txq); - if (status == NETDEV_TX_OK) + if (dev_xmit_complete(status)) break; } @@ -352,7 +352,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, } - if (status != NETDEV_TX_OK) { + if (!dev_xmit_complete(status)) { skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 256b7954b720..8881dd943dd0 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -93,7 +93,7 @@ static int extend_netdev_table(struct net_device *dev, u32 target_idx) static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) { struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); - int id = css->cgroup->id; + int id = css->id; if (map && id < map->priomap_len) return map->priomap[id]; @@ -113,7 +113,7 @@ static int netprio_set_prio(struct cgroup_subsys_state *css, struct net_device *dev, u32 prio) { struct netprio_map *map; - int id = css->cgroup->id; + int id = css->id; int ret; /* avoid extending priomap for zero writes */ @@ -177,7 +177,7 @@ static void cgrp_css_free(struct cgroup_subsys_state *css) static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) { - return css->cgroup->id; + return css->id; } static int read_priomap(struct seq_file *sf, void *v) @@ -237,7 +237,7 @@ static void net_prio_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; cgroup_taskset_for_each(p, css, tset) { - void *v = (void *)(unsigned long)css->cgroup->id; + void *v = (void *)(unsigned long)css->id; task_lock(p); iterate_fd(p->files, 0, update_netprio, v); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 3272dc7a8c81..10d2b255df5e 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -18,6 +18,9 @@ #include <trace/events/page_pool.h> +#define DEFER_TIME (msecs_to_jiffies(1000)) +#define DEFER_WARN_INTERVAL (60 * HZ) + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params) { @@ -44,6 +47,21 @@ static int page_pool_init(struct page_pool *pool, (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) { + /* In order to request DMA-sync-for-device the page + * needs to be mapped + */ + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + return -EINVAL; + + if (!pool->p.max_len) + return -EINVAL; + + /* pool->p.offset has to be set according to the address + * offset used by the DMA engine to start copying rx data + */ + } + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) return -ENOMEM; @@ -61,7 +79,7 @@ static int page_pool_init(struct page_pool *pool, struct page_pool *page_pool_create(const struct page_pool_params *params) { struct page_pool *pool; - int err = 0; + int err; pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); if (!pool) @@ -78,47 +96,86 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) } EXPORT_SYMBOL(page_pool_create); -/* fast path */ -static struct page *__page_pool_get_cached(struct page_pool *pool) +static void __page_pool_return_page(struct page_pool *pool, struct page *page); + +noinline +static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) { struct ptr_ring *r = &pool->ring; struct page *page; + int pref_nid; /* preferred NUMA node */ /* Quicker fallback, avoid locks when ring is empty */ if (__ptr_ring_empty(r)) return NULL; - /* Test for safe-context, caller should provide this guarantee */ - if (likely(in_serving_softirq())) { - if (likely(pool->alloc.count)) { - /* Fast-path */ - page = pool->alloc.cache[--pool->alloc.count]; - return page; - } - /* Slower-path: Alloc array empty, time to refill - * - * Open-coded bulk ptr_ring consumer. - * - * Discussion: the ring consumer lock is not really - * needed due to the softirq/NAPI protection, but - * later need the ability to reclaim pages on the - * ring. Thus, keeping the locks. - */ - spin_lock(&r->consumer_lock); - while ((page = __ptr_ring_consume(r))) { - if (pool->alloc.count == PP_ALLOC_CACHE_REFILL) - break; + /* Softirq guarantee CPU and thus NUMA node is stable. This, + * assumes CPU refilling driver RX-ring will also run RX-NAPI. + */ +#ifdef CONFIG_NUMA + pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid; +#else + /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */ + pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */ +#endif + + /* Slower-path: Get pages from locked ring queue */ + spin_lock(&r->consumer_lock); + + /* Refill alloc array, but only if NUMA match */ + do { + page = __ptr_ring_consume(r); + if (unlikely(!page)) + break; + + if (likely(page_to_nid(page) == pref_nid)) { pool->alloc.cache[pool->alloc.count++] = page; + } else { + /* NUMA mismatch; + * (1) release 1 page to page-allocator and + * (2) break out to fallthrough to alloc_pages_node. + * This limit stress on page buddy alloactor. + */ + __page_pool_return_page(pool, page); + page = NULL; + break; } - spin_unlock(&r->consumer_lock); - return page; + } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); + + /* Return last page */ + if (likely(pool->alloc.count > 0)) + page = pool->alloc.cache[--pool->alloc.count]; + + spin_unlock(&r->consumer_lock); + return page; +} + +/* fast path */ +static struct page *__page_pool_get_cached(struct page_pool *pool) +{ + struct page *page; + + /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */ + if (likely(pool->alloc.count)) { + /* Fast-path */ + page = pool->alloc.cache[--pool->alloc.count]; + } else { + page = page_pool_refill_alloc_cache(pool); } - /* Slow-path: Get page from locked ring queue */ - page = ptr_ring_consume(&pool->ring); return page; } +static void page_pool_dma_sync_for_device(struct page_pool *pool, + struct page *page, + unsigned int dma_sync_size) +{ + dma_sync_size = min(dma_sync_size, pool->p.max_len); + dma_sync_single_range_for_device(pool->p.dev, page->dma_addr, + pool->p.offset, dma_sync_size, + pool->p.dma_dir); +} + /* slow path */ noinline static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, @@ -142,7 +199,11 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, */ /* Cache was empty, do real allocation */ +#ifdef CONFIG_NUMA page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); +#else + page = alloc_pages(gfp, pool->p.order); +#endif if (!page) return NULL; @@ -163,6 +224,9 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, } page->dma_addr = dma; + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, pool->p.max_len); + skip_dma_map: /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -200,22 +264,14 @@ static s32 page_pool_inflight(struct page_pool *pool) { u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); - s32 distance; - - distance = _distance(hold_cnt, release_cnt); - - trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt); - return distance; -} + s32 inflight; -static bool __page_pool_safe_to_destroy(struct page_pool *pool) -{ - s32 inflight = page_pool_inflight(pool); + inflight = _distance(hold_cnt, release_cnt); - /* The distance should not be able to become negative */ + trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight); - return (inflight == 0); + return inflight; } /* Cleanup page_pool state from page */ @@ -223,6 +279,7 @@ static void __page_pool_clean_page(struct page_pool *pool, struct page *page) { dma_addr_t dma; + int count; if (!(pool->p.flags & PP_FLAG_DMA_MAP)) goto skip_dma_unmap; @@ -234,9 +291,11 @@ static void __page_pool_clean_page(struct page_pool *pool, DMA_ATTR_SKIP_CPU_SYNC); page->dma_addr = 0; skip_dma_unmap: - atomic_inc(&pool->pages_state_release_cnt); - trace_page_pool_state_release(pool, page, - atomic_read(&pool->pages_state_release_cnt)); + /* This may be the last page returned, releasing the pool, so + * it is not safe to reference pool afterwards. + */ + count = atomic_inc_return(&pool->pages_state_release_cnt); + trace_page_pool_state_release(pool, page, count); } /* unmap the page and clean our state */ @@ -290,8 +349,16 @@ static bool __page_pool_recycle_direct(struct page *page, return true; } -void __page_pool_put_page(struct page_pool *pool, - struct page *page, bool allow_direct) +/* page is NOT reusable when: + * 1) allocated when system is under some pressure. (page_is_pfmemalloc) + */ +static bool pool_page_reusable(struct page_pool *pool, struct page *page) +{ + return !page_is_pfmemalloc(page); +} + +void __page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the @@ -299,9 +366,14 @@ void __page_pool_put_page(struct page_pool *pool, * * refcnt == 1 means page_pool owns page, and can recycle it. */ - if (likely(page_ref_count(page) == 1)) { + if (likely(page_ref_count(page) == 1 && + pool_page_reusable(pool, page))) { /* Read barrier done in page_ref_count / READ_ONCE */ + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, + dma_sync_size); + if (allow_direct && in_serving_softirq()) if (__page_pool_recycle_direct(page, pool)) return; @@ -345,31 +417,10 @@ static void __page_pool_empty_ring(struct page_pool *pool) } } -static void __warn_in_flight(struct page_pool *pool) +static void page_pool_free(struct page_pool *pool) { - u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); - u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); - s32 distance; - - distance = _distance(hold_cnt, release_cnt); - - /* Drivers should fix this, but only problematic when DMA is used */ - WARN(1, "Still in-flight pages:%d hold:%u released:%u", - distance, hold_cnt, release_cnt); -} - -void __page_pool_free(struct page_pool *pool) -{ - /* Only last user actually free/release resources */ - if (!page_pool_put(pool)) - return; - - WARN(pool->alloc.count, "API usage violation"); - WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty"); - - /* Can happen due to forced shutdown */ - if (!__page_pool_safe_to_destroy(pool)) - __warn_in_flight(pool); + if (pool->disconnect) + pool->disconnect(pool); ptr_ring_cleanup(&pool->ring, NULL); @@ -378,15 +429,14 @@ void __page_pool_free(struct page_pool *pool) kfree(pool); } -EXPORT_SYMBOL(__page_pool_free); -/* Request to shutdown: release pages cached by page_pool, and check - * for in-flight pages - */ -bool __page_pool_request_shutdown(struct page_pool *pool) +static void page_pool_empty_alloc_cache_once(struct page_pool *pool) { struct page *page; + if (pool->destroy_cnt) + return; + /* Empty alloc cache, assume caller made sure this is * no-longer in use, and page_pool_alloc_pages() cannot be * call concurrently. @@ -395,12 +445,91 @@ bool __page_pool_request_shutdown(struct page_pool *pool) page = pool->alloc.cache[--pool->alloc.count]; __page_pool_return_page(pool, page); } +} + +static void page_pool_scrub(struct page_pool *pool) +{ + page_pool_empty_alloc_cache_once(pool); + pool->destroy_cnt++; /* No more consumers should exist, but producers could still * be in-flight. */ __page_pool_empty_ring(pool); +} + +static int page_pool_release(struct page_pool *pool) +{ + int inflight; + + page_pool_scrub(pool); + inflight = page_pool_inflight(pool); + if (!inflight) + page_pool_free(pool); + + return inflight; +} + +static void page_pool_release_retry(struct work_struct *wq) +{ + struct delayed_work *dwq = to_delayed_work(wq); + struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); + int inflight; + + inflight = page_pool_release(pool); + if (!inflight) + return; + + /* Periodic warning */ + if (time_after_eq(jiffies, pool->defer_warn)) { + int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; + + pr_warn("%s() stalled pool shutdown %d inflight %d sec\n", + __func__, inflight, sec); + pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; + } + + /* Still not ready to be disconnected, retry later */ + schedule_delayed_work(&pool->release_dw, DEFER_TIME); +} + +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)) +{ + refcount_inc(&pool->user_cnt); + pool->disconnect = disconnect; +} + +void page_pool_destroy(struct page_pool *pool) +{ + if (!pool) + return; + + if (!page_pool_put(pool)) + return; + + if (!page_pool_release(pool)) + return; + + pool->defer_start = jiffies; + pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; + + INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry); + schedule_delayed_work(&pool->release_dw, DEFER_TIME); +} +EXPORT_SYMBOL(page_pool_destroy); + +/* Caller must provide appropriate safe context, e.g. NAPI. */ +void page_pool_update_nid(struct page_pool *pool, int new_nid) +{ + struct page *page; + + trace_page_pool_update_nid(pool, new_nid); + pool->p.nid = new_nid; - return __page_pool_safe_to_destroy(pool); + /* Flush pool alloc cache, as refill will check NUMA node */ + while (pool->alloc.count) { + page = pool->alloc.cache[--pool->alloc.count]; + __page_pool_return_page(pool, page); + } } -EXPORT_SYMBOL(__page_pool_request_shutdown); +EXPORT_SYMBOL(page_pool_update_nid); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index bb9915291644..acc849df60b5 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -535,12 +535,12 @@ static int pgctrl_open(struct inode *inode, struct file *file) return single_open(file, pgctrl_show, PDE_DATA(inode)); } -static const struct file_operations pktgen_fops = { - .open = pgctrl_open, - .read = seq_read, - .llseek = seq_lseek, - .write = pgctrl_write, - .release = single_release, +static const struct proc_ops pktgen_proc_ops = { + .proc_open = pgctrl_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = pgctrl_write, + .proc_release = single_release, }; static int pktgen_if_show(struct seq_file *seq, void *v) @@ -1707,12 +1707,12 @@ static int pktgen_if_open(struct inode *inode, struct file *file) return single_open(file, pktgen_if_show, PDE_DATA(inode)); } -static const struct file_operations pktgen_if_fops = { - .open = pktgen_if_open, - .read = seq_read, - .llseek = seq_lseek, - .write = pktgen_if_write, - .release = single_release, +static const struct proc_ops pktgen_if_proc_ops = { + .proc_open = pktgen_if_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = pktgen_if_write, + .proc_release = single_release, }; static int pktgen_thread_show(struct seq_file *seq, void *v) @@ -1844,12 +1844,12 @@ static int pktgen_thread_open(struct inode *inode, struct file *file) return single_open(file, pktgen_thread_show, PDE_DATA(inode)); } -static const struct file_operations pktgen_thread_fops = { - .open = pktgen_thread_open, - .read = seq_read, - .llseek = seq_lseek, - .write = pktgen_thread_write, - .release = single_release, +static const struct proc_ops pktgen_thread_proc_ops = { + .proc_open = pktgen_thread_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = pktgen_thread_write, + .proc_release = single_release, }; /* Think find or remove for NN */ @@ -1926,7 +1926,7 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d pkt_dev->entry = proc_create_data(dev->name, 0600, pn->proc_dir, - &pktgen_if_fops, + &pktgen_if_proc_ops, pkt_dev); if (!pkt_dev->entry) pr_err("can't move proc entry for '%s'\n", @@ -2156,7 +2156,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) s64 remaining; struct hrtimer_sleeper t; - hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_set_expires(&t.timer, spin_until); remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer)); @@ -2170,11 +2170,9 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) end_time = ktime_get(); } while (ktime_compare(end_time, spin_until) < 0); } else { - /* see do_nanosleep */ - hrtimer_init_sleeper(&t, current); do { set_current_state(TASK_INTERRUPTIBLE); - hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); + hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_ABS); if (likely(t.task)) schedule(); @@ -2652,7 +2650,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, } get_page(pkt_dev->page); skb_frag_set_page(skb, i, pkt_dev->page); - skb_shinfo(skb)->frags[i].page_offset = 0; + skb_frag_off_set(&skb_shinfo(skb)->frags[i], 0); /*last fragment, fill rest of data*/ if (i == (frags - 1)) skb_frag_size_set(&skb_shinfo(skb)->frags[i], @@ -3406,7 +3404,6 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) HARD_TX_LOCK(odev, txq, smp_processor_id()); if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) { - ret = NETDEV_TX_BUSY; pkt_dev->last_ok = 0; goto unlock; } @@ -3641,7 +3638,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->clone_skb = pg_clone_skb_d; pkt_dev->entry = proc_create_data(ifname, 0600, t->net->proc_dir, - &pktgen_if_fops, pkt_dev); + &pktgen_if_proc_ops, pkt_dev); if (!pkt_dev->entry) { pr_err("cannot create %s/%s procfs entry\n", PG_PROC_DIR, ifname); @@ -3711,7 +3708,7 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn) t->tsk = p; pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir, - &pktgen_thread_fops, t); + &pktgen_thread_proc_ops, t); if (!pe) { pr_err("cannot create %s/%s procfs entry\n", PG_PROC_DIR, t->tsk->comm); @@ -3796,7 +3793,7 @@ static int __net_init pg_net_init(struct net *net) pr_warn("cannot create /proc/net/%s\n", PG_PROC_DIR); return -ENODEV; } - pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_fops); + pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_proc_ops); if (pe == NULL) { pr_err("cannot create %s procfs entry\n", PGCTRL); ret = -EINVAL; diff --git a/net/core/request_sock.c b/net/core/request_sock.c index c9bb00008528..f35c2e998406 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -96,7 +96,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; - tcp_sk(sk)->fastopen_rsk = NULL; + RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL); spin_lock_bh(&fastopenq->lock); fastopenq->qlen--; tcp_rsk(req)->tfo_listener = false; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1ee6460f8275..09c44bf2e1d2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -980,6 +980,19 @@ static size_t rtnl_xdp_size(void) return xdp_size; } +static size_t rtnl_prop_list_size(const struct net_device *dev) +{ + struct netdev_name_node *name_node; + size_t size; + + if (list_empty(&dev->name_node->list)) + return 0; + size = nla_total_size(0); + list_for_each_entry(name_node, &dev->name_node->list, list) + size += nla_total_size(ALTIFNAMSIZ); + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -1027,6 +1040,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + nla_total_size(4) /* IFLA_MIN_MTU */ + nla_total_size(4) /* IFLA_MAX_MTU */ + + rtnl_prop_list_size(dev) + + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + 0; } @@ -1204,6 +1219,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct ifla_vf_mac vf_mac; struct ifla_vf_broadcast vf_broadcast; struct ifla_vf_info ivi; + struct ifla_vf_guid node_guid; + struct ifla_vf_guid port_guid; memset(&ivi, 0, sizeof(ivi)); @@ -1225,6 +1242,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, return 0; memset(&vf_vlan_info, 0, sizeof(vf_vlan_info)); + memset(&node_guid, 0, sizeof(node_guid)); + memset(&port_guid, 0, sizeof(port_guid)); vf_mac.vf = vf_vlan.vf = @@ -1234,7 +1253,9 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_spoofchk.vf = vf_linkstate.vf = vf_rss_query_en.vf = - vf_trust.vf = ivi.vf; + vf_trust.vf = + node_guid.vf = + port_guid.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); @@ -1270,6 +1291,16 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put(skb, IFLA_VF_TRUST, sizeof(vf_trust), &vf_trust)) goto nla_put_vf_failure; + + if (dev->netdev_ops->ndo_get_vf_guid && + !dev->netdev_ops->ndo_get_vf_guid(dev, vfs_num, &node_guid, + &port_guid)) { + if (nla_put(skb, IFLA_VF_IB_NODE_GUID, sizeof(node_guid), + &node_guid) || + nla_put(skb, IFLA_VF_IB_PORT_GUID, sizeof(port_guid), + &port_guid)) + goto nla_put_vf_failure; + } vfvlanlist = nla_nest_start_noflag(skb, IFLA_VF_VLAN_LIST); if (!vfvlanlist) goto nla_put_vf_failure; @@ -1523,7 +1554,7 @@ static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, static int rtnl_fill_link_netnsid(struct sk_buff *skb, const struct net_device *dev, - struct net *src_net) + struct net *src_net, gfp_t gfp) { bool put_iflink = false; @@ -1531,7 +1562,7 @@ static int rtnl_fill_link_netnsid(struct sk_buff *skb, struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); if (!net_eq(dev_net(dev), link_net)) { - int id = peernet2id_alloc(src_net, link_net); + int id = peernet2id_alloc(src_net, link_net, gfp); if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) return -EMSGSIZE; @@ -1584,12 +1615,48 @@ static int rtnl_fill_link_af(struct sk_buff *skb, return 0; } +static int rtnl_fill_alt_ifnames(struct sk_buff *skb, + const struct net_device *dev) +{ + struct netdev_name_node *name_node; + int count = 0; + + list_for_each_entry(name_node, &dev->name_node->list, list) { + if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name)) + return -EMSGSIZE; + count++; + } + return count; +} + +static int rtnl_fill_prop_list(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *prop_list; + int ret; + + prop_list = nla_nest_start(skb, IFLA_PROP_LIST); + if (!prop_list) + return -EMSGSIZE; + + ret = rtnl_fill_alt_ifnames(skb, dev); + if (ret <= 0) + goto nest_cancel; + + nla_nest_end(skb, prop_list); + return 0; + +nest_cancel: + nla_nest_cancel(skb, prop_list); + return ret; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, u32 event, int *new_nsid, int new_ifindex, - int tgt_netnsid) + int tgt_netnsid, gfp_t gfp) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1681,7 +1748,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; } - if (rtnl_fill_link_netnsid(skb, dev, src_net)) + if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp)) goto nla_put_failure; if (new_nsid && @@ -1691,12 +1758,18 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0) goto nla_put_failure; + if (memchr_inv(dev->perm_addr, '\0', dev->addr_len) && + nla_put(skb, IFLA_PERM_ADDRESS, dev->addr_len, dev->perm_addr)) + goto nla_put_failure; rcu_read_lock(); if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) goto nla_put_failure_rcu; rcu_read_unlock(); + if (rtnl_fill_prop_list(skb, dev)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -1750,6 +1823,10 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, [IFLA_MIN_MTU] = { .type = NLA_U32 }, [IFLA_MAX_MTU] = { .type = NLA_U32 }, + [IFLA_PROP_LIST] = { .type = NLA_NESTED }, + [IFLA_ALT_IFNAME] = { .type = NLA_STRING, + .len = ALTIFNAMSIZ - 1 }, + [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2001,7 +2078,7 @@ walk_entries: NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, 0, flags, ext_filter_mask, 0, NULL, 0, - netnsid); + netnsid, GFP_KERNEL); if (err < 0) { if (likely(skb->len)) @@ -2195,6 +2272,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_MAC]) { struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]); + if (ivm->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_mac) err = ops->ndo_set_vf_mac(dev, ivm->vf, @@ -2206,6 +2285,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_VLAN]) { struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]); + if (ivv->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, @@ -2238,6 +2319,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (len == 0) return -EINVAL; + if (ivvl[0]->vf >= INT_MAX) + return -EINVAL; err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) @@ -2248,6 +2331,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]); struct ifla_vf_info ivf; + if (ivt->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_get_vf_config) err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf); @@ -2266,6 +2351,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_RATE]) { struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]); + if (ivt->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_rate) err = ops->ndo_set_vf_rate(dev, ivt->vf, @@ -2278,6 +2365,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_SPOOFCHK]) { struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]); + if (ivs->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_spoofchk) err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, @@ -2289,6 +2378,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_LINK_STATE]) { struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]); + if (ivl->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_link_state) err = ops->ndo_set_vf_link_state(dev, ivl->vf, @@ -2302,6 +2393,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) err = -EOPNOTSUPP; ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]); + if (ivrssq_en->vf >= INT_MAX) + return -EINVAL; if (ops->ndo_set_vf_rss_query_en) err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf, ivrssq_en->setting); @@ -2312,6 +2405,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_TRUST]) { struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]); + if (ivt->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_trust) err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting); @@ -2322,15 +2417,18 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_IB_NODE_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]); + if (ivt->vf >= INT_MAX) + return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; - return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID); } if (tb[IFLA_VF_IB_PORT_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]); + if (ivt->vf >= INT_MAX) + return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; @@ -2355,6 +2453,7 @@ static int do_set_master(struct net_device *dev, int ifindex, err = ops->ndo_del_slave(upper_dev, dev); if (err) return err; + netdev_update_lockdep_key(dev); } else { return -EOPNOTSUPP; } @@ -2722,6 +2821,26 @@ errout: return err; } +static struct net_device *rtnl_dev_get(struct net *net, + struct nlattr *ifname_attr, + struct nlattr *altifname_attr, + char *ifname) +{ + char buffer[ALTIFNAMSIZ]; + + if (!ifname) { + ifname = buffer; + if (ifname_attr) + nla_strlcpy(ifname, ifname_attr, IFNAMSIZ); + else if (altifname_attr) + nla_strlcpy(ifname, altifname_attr, ALTIFNAMSIZ); + else + return NULL; + } + + return __dev_get_by_name(net, ifname); +} + static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -2750,8 +2869,8 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(net, ifname); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, NULL, tb[IFLA_ALT_IFNAME], ifname); else goto errout; @@ -2824,7 +2943,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *tgt_net = net; struct net_device *dev = NULL; struct ifinfomsg *ifm; - char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; int err; int netnsid = -1; @@ -2838,9 +2956,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; - if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - if (tb[IFLA_TARGET_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); @@ -2852,8 +2967,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(tgt_net, ifname); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, tb[IFLA_IFNAME], + tb[IFLA_ALT_IFNAME], NULL); else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else @@ -2937,8 +3053,17 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, dev->rtnl_link_ops = ops; dev->rtnl_link_state = RTNL_LINK_INITIALIZING; - if (tb[IFLA_MTU]) - dev->mtu = nla_get_u32(tb[IFLA_MTU]); + if (tb[IFLA_MTU]) { + u32 mtu = nla_get_u32(tb[IFLA_MTU]); + int err; + + err = dev_validate_mtu(dev, mtu, extack); + if (err) { + free_netdev(dev); + return ERR_PTR(err); + } + dev->mtu = mtu; + } if (tb[IFLA_ADDRESS]) { memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]), nla_len(tb[IFLA_ADDRESS])); @@ -3024,12 +3149,10 @@ replay: ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); - else { - if (ifname[0]) - dev = __dev_get_by_name(net, ifname); - else - dev = NULL; - } + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, NULL, tb[IFLA_ALT_IFNAME], ifname); + else + dev = NULL; if (dev) { master_dev = netdev_master_upper_dev_get(dev); @@ -3291,6 +3414,7 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, switch (i) { case IFLA_IFNAME: + case IFLA_ALT_IFNAME: case IFLA_EXT_MASK: case IFLA_TARGET_NETNSID: break; @@ -3309,7 +3433,6 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct ifinfomsg *ifm; - char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct sk_buff *nskb; @@ -3332,9 +3455,6 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(tgt_net); } - if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); @@ -3342,8 +3462,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(tgt_net, ifname); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(tgt_net, tb[IFLA_IFNAME], + tb[IFLA_ALT_IFNAME], NULL); else goto out; @@ -3359,7 +3480,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, err = rtnl_fill_ifinfo(nskb, dev, net, RTM_NEWLINK, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, ext_filter_mask, - 0, NULL, 0, netnsid); + 0, NULL, 0, netnsid, GFP_KERNEL); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -3373,6 +3494,100 @@ out: return err; } +static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, + bool *changed, struct netlink_ext_ack *extack) +{ + char *alt_ifname; + int err; + + err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack); + if (err) + return err; + + alt_ifname = nla_data(attr); + if (cmd == RTM_NEWLINKPROP) { + alt_ifname = kstrdup(alt_ifname, GFP_KERNEL); + if (!alt_ifname) + return -ENOMEM; + err = netdev_name_node_alt_create(dev, alt_ifname); + if (err) { + kfree(alt_ifname); + return err; + } + } else if (cmd == RTM_DELLINKPROP) { + err = netdev_name_node_alt_destroy(dev, alt_ifname); + if (err) + return err; + } else { + WARN_ON(1); + return 0; + } + + *changed = true; + return 0; +} + +static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFLA_MAX + 1]; + struct net_device *dev; + struct ifinfomsg *ifm; + bool changed = false; + struct nlattr *attr; + int err, rem; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); + if (err) + return err; + + err = rtnl_ensure_unique_netns(tb, extack, true); + if (err) + return err; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, tb[IFLA_IFNAME], + tb[IFLA_ALT_IFNAME], NULL); + else + return -EINVAL; + + if (!dev) + return -ENODEV; + + if (!tb[IFLA_PROP_LIST]) + return 0; + + nla_for_each_nested(attr, tb[IFLA_PROP_LIST], rem) { + switch (nla_type(attr)) { + case IFLA_ALT_IFNAME: + err = rtnl_alt_ifname(cmd, dev, attr, &changed, extack); + if (err) + return err; + break; + } + } + + if (changed) + netdev_state_change(dev); + return 0; +} + +static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack); +} + +static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack); +} + static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); @@ -3471,7 +3686,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), type, 0, 0, change, 0, 0, event, - new_nsid, new_ifindex, -1); + new_nsid, new_ifindex, -1, flags); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -3916,7 +4131,7 @@ static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { - NL_SET_ERR_MSG(extack, "Invalid values in header for fbd dump request"); + NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request"); return -EINVAL; } @@ -5331,6 +5546,9 @@ void __init rtnetlink_init(void) rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0); rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0); + rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0); rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); diff --git a/net/core/scm.c b/net/core/scm.c index 31a38239c92f..dc6fed1f221c 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -268,8 +268,10 @@ void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_inter struct scm_timestamping tss; int i; - for (i = 0; i < ARRAY_SIZE(tss.ts); i++) - tss.ts[i] = timespec64_to_timespec(tss_internal->ts[i]); + for (i = 0; i < ARRAY_SIZE(tss.ts); i++) { + tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec; + tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec; + } put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_OLD, sizeof(tss), &tss); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0338820ee0ec..864cb9e9622f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -68,6 +68,7 @@ #include <net/ip6_checksum.h> #include <net/xfrm.h> #include <net/mpls.h> +#include <net/mptcp.h> #include <linux/uaccess.h> #include <trace/events/skb.h> @@ -785,7 +786,7 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) struct page *p; u8 *vaddr; - skb_frag_foreach_page(frag, frag->page_offset, + skb_frag_foreach_page(frag, skb_frag_off(frag), skb_frag_size(frag), p, p_off, p_len, copied) { seg_len = min_t(int, p_len, len); @@ -1375,7 +1376,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) struct page *p; u8 *vaddr; - skb_frag_foreach_page(f, f->page_offset, skb_frag_size(f), + skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), p, p_off, p_len, copied) { u32 copy, done = 0; vaddr = kmap_atomic(p); @@ -2144,10 +2145,12 @@ pull_pages: skb_frag_unref(skb, i); eat -= size; } else { - skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; + skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; + + *frag = skb_shinfo(skb)->frags[i]; if (eat) { - skb_shinfo(skb)->frags[k].page_offset += eat; - skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat); + skb_frag_off_add(frag, eat); + skb_frag_size_sub(frag, eat); if (!i) goto end; eat = 0; @@ -2219,7 +2222,7 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) copy = len; skb_frag_foreach_page(f, - f->page_offset + offset - start, + skb_frag_off(f) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); memcpy(to + copied, vaddr + p_off, p_len); @@ -2395,7 +2398,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; if (__splice_segment(skb_frag_page(f), - f->page_offset, skb_frag_size(f), + skb_frag_off(f), skb_frag_size(f), offset, len, spd, false, sk, pipe)) return true; } @@ -2485,20 +2488,20 @@ do_frag_list: for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; - if (offset < frag->size) + if (offset < skb_frag_size(frag)) break; - offset -= frag->size; + offset -= skb_frag_size(frag); } for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; - slen = min_t(size_t, len, frag->size - offset); + slen = min_t(size_t, len, skb_frag_size(frag) - offset); while (slen) { - ret = kernel_sendpage_locked(sk, frag->page.p, - frag->page_offset + offset, + ret = kernel_sendpage_locked(sk, skb_frag_page(frag), + skb_frag_off(frag) + offset, slen, MSG_DONTWAIT); if (ret <= 0) goto error; @@ -2580,7 +2583,7 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) copy = len; skb_frag_foreach_page(frag, - frag->page_offset + offset - start, + skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); memcpy(vaddr + p_off, from + copied, p_len); @@ -2660,7 +2663,7 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, copy = len; skb_frag_foreach_page(frag, - frag->page_offset + offset - start, + skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); csum2 = INDIRECT_CALL_1(ops->update, @@ -2759,7 +2762,7 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, copy = len; skb_frag_foreach_page(frag, - frag->page_offset + offset - start, + skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); csum2 = csum_partial_copy_nocheck(vaddr + p_off, @@ -2975,11 +2978,15 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) skb_zerocopy_clone(to, from, GFP_ATOMIC); for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { + int size; + if (!len) break; skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; - skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len); - len -= skb_shinfo(to)->frags[j].size; + size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), + len); + skb_frag_size_set(&skb_shinfo(to)->frags[j], size); + len -= size; skb_frag_ref(to, j); j++; } @@ -3230,7 +3237,7 @@ static inline void skb_split_no_header(struct sk_buff *skb, * 2. Split is accurately. We make this. */ skb_frag_ref(skb, i); - skb_shinfo(skb1)->frags[0].page_offset += len - pos; + skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); skb_shinfo(skb)->nr_frags++; @@ -3293,7 +3300,7 @@ static int skb_prepare_for_shift(struct sk_buff *skb) int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) { int from, to, merge, todo; - struct skb_frag_struct *fragfrom, *fragto; + skb_frag_t *fragfrom, *fragto; BUG_ON(shiftlen > skb->len); @@ -3312,7 +3319,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) */ if (!to || !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), - fragfrom->page_offset)) { + skb_frag_off(fragfrom))) { merge = -1; } else { merge = to - 1; @@ -3329,7 +3336,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) skb_frag_size_add(fragto, shiftlen); skb_frag_size_sub(fragfrom, shiftlen); - fragfrom->page_offset += shiftlen; + skb_frag_off_add(fragfrom, shiftlen); goto onlymerged; } @@ -3360,11 +3367,11 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) } else { __skb_frag_ref(fragfrom); - fragto->page = fragfrom->page; - fragto->page_offset = fragfrom->page_offset; + skb_frag_page_copy(fragto, fragfrom); + skb_frag_off_copy(fragto, fragfrom); skb_frag_size_set(fragto, todo); - fragfrom->page_offset += todo; + skb_frag_off_add(fragfrom, todo); skb_frag_size_sub(fragfrom, todo); todo = 0; @@ -3489,7 +3496,7 @@ next_skb: if (!st->frag_data) st->frag_data = kmap_atomic(skb_frag_page(frag)); - *data = (u8 *) st->frag_data + frag->page_offset + + *data = (u8 *) st->frag_data + skb_frag_off(frag) + (abs_offset - st->stepped_offset); return block_limit - abs_offset; @@ -3625,13 +3632,104 @@ static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) struct page *page; page = virt_to_head_page(frag_skb->head); - head_frag.page.p = page; - head_frag.page_offset = frag_skb->data - - (unsigned char *)page_address(page); - head_frag.size = skb_headlen(frag_skb); + __skb_frag_set_page(&head_frag, page); + skb_frag_off_set(&head_frag, frag_skb->data - + (unsigned char *)page_address(page)); + skb_frag_size_set(&head_frag, skb_headlen(frag_skb)); return head_frag; } +struct sk_buff *skb_segment_list(struct sk_buff *skb, + netdev_features_t features, + unsigned int offset) +{ + struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; + unsigned int tnl_hlen = skb_tnl_header_len(skb); + unsigned int delta_truesize = 0; + unsigned int delta_len = 0; + struct sk_buff *tail = NULL; + struct sk_buff *nskb; + + skb_push(skb, -skb_network_offset(skb) + offset); + + skb_shinfo(skb)->frag_list = NULL; + + do { + nskb = list_skb; + list_skb = list_skb->next; + + if (!tail) + skb->next = nskb; + else + tail->next = nskb; + + tail = nskb; + + delta_len += nskb->len; + delta_truesize += nskb->truesize; + + skb_push(nskb, -skb_network_offset(nskb) + offset); + + __copy_skb_header(nskb, skb); + + skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); + skb_copy_from_linear_data_offset(skb, -tnl_hlen, + nskb->data - tnl_hlen, + offset + tnl_hlen); + + if (skb_needs_linearize(nskb, features) && + __skb_linearize(nskb)) + goto err_linearize; + + } while (list_skb); + + skb->truesize = skb->truesize - delta_truesize; + skb->data_len = skb->data_len - delta_len; + skb->len = skb->len - delta_len; + + skb_gso_reset(skb); + + skb->prev = tail; + + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + goto err_linearize; + + skb_get(skb); + + return skb; + +err_linearize: + kfree_skb_list(skb->next); + skb->next = NULL; + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL_GPL(skb_segment_list); + +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) +{ + if (unlikely(p->len + skb->len >= 65536)) + return -E2BIG; + + if (NAPI_GRO_CB(p)->last == p) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; + + skb_pull(skb, skb_gro_offset(skb)); + + NAPI_GRO_CB(p)->last = skb; + NAPI_GRO_CB(p)->count++; + p->data_len += skb->len; + p->truesize += skb->truesize; + p->len += skb->len; + + NAPI_GRO_CB(skb)->same_flow = 1; + + return 0; +} +EXPORT_SYMBOL_GPL(skb_gro_receive_list); + /** * skb_segment - Perform protocol segmentation on skb. * @head_skb: buffer to segment @@ -3664,6 +3762,25 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, int pos; int dummy; + if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) && + (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) { + /* gso_size is untrusted, and we have a frag_list with a linear + * non head_frag head. + * + * (we assume checking the first list_skb member suffices; + * i.e if either of the list_skb members have non head_frag + * head, then the first one has too). + * + * If head_skb's headlen does not fit requested gso_size, it + * means that the frag_list members do NOT terminate on exact + * gso_size boundaries. Hence we cannot perform skb_frag_t page + * sharing. Therefore we must fallback to copying the frag_list + * skbs; we do so by disabling SG. + */ + if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) + features &= ~NETIF_F_SG; + } + __skb_push(head_skb, doffset); proto = skb_network_protocol(head_skb, &dummy); if (unlikely(!proto)) @@ -3871,7 +3988,7 @@ normal: size = skb_frag_size(nskb_frag); if (pos < offset) { - nskb_frag->page_offset += offset - pos; + skb_frag_off_add(nskb_frag, offset - pos); skb_frag_size_sub(nskb_frag, offset - pos); } @@ -3992,7 +4109,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) *--frag = *--frag2; } while (--i); - frag->page_offset += offset; + skb_frag_off_add(frag, offset); skb_frag_size_sub(frag, offset); /* all fragments truesize : remove (head size + sk_buff) */ @@ -4021,8 +4138,8 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; - frag->page.p = page; - frag->page_offset = first_offset; + __skb_frag_set_page(frag, page); + skb_frag_off_set(frag, first_offset); skb_frag_size_set(frag, first_size); memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); @@ -4038,7 +4155,7 @@ merge: if (offset > headlen) { unsigned int eat = offset - headlen; - skbinfo->frags[0].page_offset += eat; + skb_frag_off_add(&skbinfo->frags[0], eat); skb_frag_size_sub(&skbinfo->frags[0], eat); skb->data_len -= eat; skb->len -= eat; @@ -4081,6 +4198,12 @@ static const u8 skb_ext_type_len[] = { #ifdef CONFIG_XFRM [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), #endif +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), +#endif +#if IS_ENABLED(CONFIG_MPTCP) + [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -4092,6 +4215,12 @@ static __always_inline unsigned int skb_ext_total_length(void) #ifdef CONFIG_XFRM skb_ext_type_len[SKB_EXT_SEC_PATH] + #endif +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + skb_ext_type_len[TC_SKB_EXT] + +#endif +#if IS_ENABLED(CONFIG_MPTCP) + skb_ext_type_len[SKB_EXT_MPTCP] + +#endif 0; } @@ -4163,7 +4292,7 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, if (copy > len) copy = len; sg_set_page(&sg[elt], skb_frag_page(frag), copy, - frag->page_offset+offset-start); + skb_frag_off(frag) + offset - start); elt++; if (!(len -= copy)) return elt; @@ -4384,7 +4513,7 @@ static void skb_set_err_queue(struct sk_buff *skb) int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) { if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= - (unsigned int)sk->sk_rcvbuf) + (unsigned int)READ_ONCE(sk->sk_rcvbuf)) return -ENOMEM; skb_orphan(skb); @@ -5088,8 +5217,8 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); - secpath_reset(skb); - nf_reset(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); nf_reset_trace(skb); #ifdef CONFIG_NET_SWITCHDEV @@ -5441,17 +5570,22 @@ static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, } /** - * skb_mpls_push() - push a new MPLS header after the mac header + * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of + * the packet * * @skb: buffer * @mpls_lse: MPLS label stack entry to push * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) + * @mac_len: length of the MAC header + * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is + * ethernet * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ -int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto) +int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, + int mac_len, bool ethernet) { struct mpls_shim_hdr *lse; int err; @@ -5468,21 +5602,22 @@ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto) return err; if (!skb->inner_protocol) { - skb_set_inner_network_header(skb, skb->mac_len); + skb_set_inner_network_header(skb, skb_network_offset(skb)); skb_set_inner_protocol(skb, skb->protocol); } skb_push(skb, MPLS_HLEN); memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), - skb->mac_len); + mac_len); skb_reset_mac_header(skb); - skb_set_network_header(skb, skb->mac_len); + skb_set_network_header(skb, mac_len); + skb_reset_mac_len(skb); lse = mpls_hdr(skb); lse->label_stack_entry = mpls_lse; skb_postpush_rcsum(skb, lse, MPLS_HLEN); - if (skb->dev && skb->dev->type == ARPHRD_ETHER) + if (ethernet) skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); skb->protocol = mpls_proto; @@ -5495,31 +5630,34 @@ EXPORT_SYMBOL_GPL(skb_mpls_push); * * @skb: buffer * @next_proto: ethertype of header after popped MPLS header + * @mac_len: length of the MAC header + * @ethernet: flag to indicate if the packet is ethernet * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ -int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto) +int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, + bool ethernet) { int err; if (unlikely(!eth_p_mpls(skb->protocol))) - return -EINVAL; + return 0; - err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); + err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); if (unlikely(err)) return err; skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), - skb->mac_len); + mac_len); __skb_pull(skb, MPLS_HLEN); skb_reset_mac_header(skb); - skb_set_network_header(skb, skb->mac_len); + skb_set_network_header(skb, mac_len); - if (skb->dev && skb->dev->type == ARPHRD_ETHER) { + if (ethernet) { struct ethhdr *hdr; /* use mpls_hdr() to get ethertype to account for VLANs. */ @@ -5834,7 +5972,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, * where splitting is expensive. * 2. Split is accurately. We make this. */ - shinfo->frags[0].page_offset += off - pos; + skb_frag_off_add(&shinfo->frags[0], off - pos); skb_frag_size_sub(&shinfo->frags[0], off - pos); } skb_frag_ref(skb, i); @@ -5940,7 +6078,14 @@ static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); } -static struct skb_ext *skb_ext_alloc(void) +/** + * __skb_ext_alloc - allocate a new skb extensions storage + * + * Returns the newly allocated pointer. The pointer can later attached to a + * skb via __skb_ext_set(). + * Note: caller must handle the skb_ext as an opaque data. + */ +struct skb_ext *__skb_ext_alloc(void) { struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); @@ -5981,6 +6126,30 @@ static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, } /** + * __skb_ext_set - attach the specified extension storage to this skb + * @skb: buffer + * @id: extension id + * @ext: extension storage previously allocated via __skb_ext_alloc() + * + * Existing extensions, if any, are cleared. + * + * Returns the pointer to the extension. + */ +void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, + struct skb_ext *ext) +{ + unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); + + skb_ext_put(skb); + newlen = newoff + skb_ext_type_len[id]; + ext->chunks = newlen; + ext->offset[id] = newoff; + skb->extensions = ext; + skb->active_extensions = 1 << id; + return skb_ext_get_ptr(ext, id); +} + +/** * skb_ext_add - allocate space for given extension, COW if needed * @skb: buffer * @id: extension to allocate space for @@ -6013,7 +6182,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) } else { newoff = SKB_EXT_CHUNKSIZEOF(*new); - new = skb_ext_alloc(); + new = __skb_ext_alloc(); if (!new) return NULL; } diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 6832eeb4b785..ded2d5227678 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -190,8 +190,7 @@ static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i, sk_msg_check_to_free(msg, i, msg->sg.size); sge = sk_msg_elem(msg, i); } - if (msg->skb) - consume_skb(msg->skb); + consume_skb(msg->skb); sk_msg_init(msg); return freed; } @@ -271,18 +270,28 @@ void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len) msg->sg.data[i].length -= trim; sk_mem_uncharge(sk, trim); + /* Adjust copybreak if it falls into the trimmed part of last buf */ + if (msg->sg.curr == i && msg->sg.copybreak > msg->sg.data[i].length) + msg->sg.copybreak = msg->sg.data[i].length; out: - /* If we trim data before curr pointer update copybreak and current - * so that any future copy operations start at new copy location. + sk_msg_iter_var_next(i); + msg->sg.end = i; + + /* If we trim data a full sg elem before curr pointer update + * copybreak and current so that any future copy operations + * start at new copy location. * However trimed data that has not yet been used in a copy op * does not require an update. */ - if (msg->sg.curr >= i) { + if (!msg->sg.size) { + msg->sg.curr = msg->sg.start; + msg->sg.copybreak = 0; + } else if (sk_msg_iter_dist(msg->sg.start, msg->sg.curr) >= + sk_msg_iter_dist(msg->sg.start, msg->sg.end)) { + sk_msg_iter_var_prev(i); msg->sg.curr = i; msg->sg.copybreak = msg->sg.data[i].length; } - sk_msg_iter_var_next(i); - msg->sg.end = i; } EXPORT_SYMBOL_GPL(sk_msg_trim); @@ -412,7 +421,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) copied = skb->len; msg->sg.start = 0; msg->sg.size = copied; - msg->sg.end = num_sge == MAX_MSG_FRAGS ? 0 : num_sge; + msg->sg.end = num_sge; msg->skb = skb; sk_psock_queue_msg(psock, msg); @@ -784,15 +793,18 @@ static void sk_psock_strp_data_ready(struct sock *sk) static void sk_psock_write_space(struct sock *sk) { struct sk_psock *psock; - void (*write_space)(struct sock *sk); + void (*write_space)(struct sock *sk) = NULL; rcu_read_lock(); psock = sk_psock(sk); - if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))) - schedule_work(&psock->work); - write_space = psock->saved_write_space; + if (likely(psock)) { + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_work(&psock->work); + write_space = psock->saved_write_space; + } rcu_read_unlock(); - write_space(sk); + if (write_space) + write_space(sk); } int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) diff --git a/net/core/sock.c b/net/core/sock.c index 6d08553f885c..a4c8fac781ff 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -333,7 +333,6 @@ EXPORT_SYMBOL(__sk_backlog_rcv); static int sock_get_timeout(long timeo, void *optval, bool old_timeval) { struct __kernel_sock_timeval tv; - int size; if (timeo == MAX_SCHEDULE_TIMEOUT) { tv.tv_sec = 0; @@ -354,13 +353,11 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval) old_tv.tv_sec = tv.tv_sec; old_tv.tv_usec = tv.tv_usec; *(struct __kernel_old_timeval *)optval = old_tv; - size = sizeof(old_tv); - } else { - *(struct __kernel_sock_timeval *)optval = tv; - size = sizeof(tv); + return sizeof(old_tv); } - return size; + *(struct __kernel_sock_timeval *)optval = tv; + return sizeof(tv); } static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval) @@ -521,8 +518,8 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, rc = sk_backlog_rcv(sk, skb); - mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); - } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { + mutex_release(&sk->sk_lock.dep_map, _RET_IP_); + } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { bh_unlock_sock(sk); atomic_inc(&sk->sk_drops); goto discard_and_relse; @@ -687,7 +684,8 @@ out: return ret; } -static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) +static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, + int valbool) { if (valbool) sock_set_flag(sk, bit); @@ -785,7 +783,8 @@ set_sndbuf: */ val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); + WRITE_ONCE(sk->sk_sndbuf, + max_t(int, val * 2, SOCK_MIN_SNDBUF)); /* Wake up sending tasks if we upped the value. */ sk->sk_write_space(sk); break; @@ -831,7 +830,8 @@ set_rcvbuf: * returning the value we actually used in getsockopt * is the most desirable behavior. */ - sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); + WRITE_ONCE(sk->sk_rcvbuf, + max_t(int, val * 2, SOCK_MIN_RCVBUF)); break; case SO_RCVBUFFORCE: @@ -974,7 +974,7 @@ set_rcvbuf: if (sock->ops->set_rcvlowat) ret = sock->ops->set_rcvlowat(sk, val); else - sk->sk_rcvlowat = val ? : 1; + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; case SO_RCVTIMEO_OLD: @@ -1125,7 +1125,7 @@ set_rcvbuf: break; } case SO_INCOMING_CPU: - sk->sk_incoming_cpu = val; + WRITE_ONCE(sk->sk_incoming_cpu, val); break; case SO_CNX_ADVICE: @@ -1474,7 +1474,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_INCOMING_CPU: - v.val = sk->sk_incoming_cpu; + v.val = READ_ONCE(sk->sk_incoming_cpu); break; case SO_MEMINFO: @@ -1700,8 +1700,6 @@ static void __sk_destruct(struct rcu_head *head) sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); } - if (rcu_access_pointer(sk->sk_reuseport_cb)) - reuseport_detach_sock(sk); sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); @@ -1728,7 +1726,14 @@ static void __sk_destruct(struct rcu_head *head) void sk_destruct(struct sock *sk) { - if (sock_flag(sk, SOCK_RCU_FREE)) + bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); + + if (rcu_access_pointer(sk->sk_reuseport_cb)) { + reuseport_detach_sock(sk); + use_call_rcu = true; + } + + if (use_call_rcu) call_rcu(&sk->sk_rcu, __sk_destruct); else __sk_destruct(&sk->sk_rcu); @@ -1851,9 +1856,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) goto out; } RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); -#ifdef CONFIG_BPF_SYSCALL - RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); -#endif + + if (bpf_sk_storage_clone(sk, newsk)) { + sk_free_unlock_clone(newsk); + newsk = NULL; + goto out; + } newsk->sk_err = 0; newsk->sk_err_soft = 0; @@ -2080,8 +2088,10 @@ EXPORT_SYMBOL(sock_i_ino); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority) { - if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + if (force || + refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { struct sk_buff *skb = alloc_skb(size, priority); + if (skb) { skb_set_owner_w(skb, sk); return skb; @@ -2182,7 +2192,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) + if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) break; if (sk->sk_shutdown & SEND_SHUTDOWN) break; @@ -2217,7 +2227,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) + if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) break; sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -2326,8 +2336,8 @@ static void sk_leave_memory_pressure(struct sock *sk) } else { unsigned long *memory_pressure = sk->sk_prot->memory_pressure; - if (memory_pressure && *memory_pressure) - *memory_pressure = 0; + if (memory_pressure && READ_ONCE(*memory_pressure)) + WRITE_ONCE(*memory_pressure, 0); } } @@ -2776,7 +2786,7 @@ static void sock_def_error_report(struct sock *sk) rcu_read_unlock(); } -static void sock_def_readable(struct sock *sk) +void sock_def_readable(struct sock *sk) { struct socket_wq *wq; @@ -2798,7 +2808,7 @@ static void sock_def_write_space(struct sock *sk) /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ - if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { + if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | @@ -2906,7 +2916,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_max_pacing_rate = ~0UL; sk->sk_pacing_rate = ~0UL; - sk->sk_pacing_shift = 10; + WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; sk_rx_queue_clear(sk); @@ -3003,7 +3013,7 @@ int sock_gettstamp(struct socket *sock, void __user *userstamp, return -ENOENT; if (ts.tv_sec == 0) { ktime_t kt = ktime_get_real(); - sock_write_timestamp(sk, kt);; + sock_write_timestamp(sk, kt); ts = ktime_to_timespec64(kt); } @@ -3030,7 +3040,7 @@ int sock_gettstamp(struct socket *sock, void __user *userstamp, } EXPORT_SYMBOL(sock_gettstamp); -void sock_enable_timestamp(struct sock *sk, int flag) +void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) { if (!sock_flag(sk, flag)) { unsigned long previous_flags = sk->sk_flags; @@ -3196,13 +3206,13 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); - mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; + mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); - mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; + mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; - mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; + mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); - mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); } @@ -3287,16 +3297,17 @@ static __init int net_inuse_init(void) core_initcall(net_inuse_init); -static void assign_proto_idx(struct proto *prot) +static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { pr_err("PROTO_INUSE_NR exhausted\n"); - return; + return -ENOSPC; } set_bit(prot->inuse_idx, proto_inuse_idx); + return 0; } static void release_proto_idx(struct proto *prot) @@ -3305,8 +3316,9 @@ static void release_proto_idx(struct proto *prot) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else -static inline void assign_proto_idx(struct proto *prot) +static inline int assign_proto_idx(struct proto *prot) { + return 0; } static inline void release_proto_idx(struct proto *prot) @@ -3355,6 +3367,8 @@ static int req_prot_init(const struct proto *prot) int proto_register(struct proto *prot, int alloc_slab) { + int ret = -ENOBUFS; + if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, prot->obj_size, 0, @@ -3391,20 +3405,27 @@ int proto_register(struct proto *prot, int alloc_slab) } mutex_lock(&proto_list_mutex); + ret = assign_proto_idx(prot); + if (ret) { + mutex_unlock(&proto_list_mutex); + goto out_free_timewait_sock_slab_name; + } list_add(&prot->node, &proto_list); - assign_proto_idx(prot); mutex_unlock(&proto_list_mutex); - return 0; + return ret; out_free_timewait_sock_slab_name: - kfree(prot->twsk_prot->twsk_slab_name); + if (alloc_slab && prot->twsk_prot) + kfree(prot->twsk_prot->twsk_slab_name); out_free_request_sock_slab: - req_prot_cleanup(prot->rsk_prot); + if (alloc_slab) { + req_prot_cleanup(prot->rsk_prot); - kmem_cache_destroy(prot->slab); - prot->slab = NULL; + kmem_cache_destroy(prot->slab); + prot->slab = NULL; + } out: - return -ENOBUFS; + return ret; } EXPORT_SYMBOL(proto_register); @@ -3478,7 +3499,7 @@ static long sock_prot_memory_allocated(struct proto *proto) return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; } -static char *sock_prot_memory_pressure(struct proto *proto) +static const char *sock_prot_memory_pressure(struct proto *proto) { return proto->memory_pressure != NULL ? proto_memory_pressure(proto) ? "yes" : "no" : "NI"; @@ -3577,7 +3598,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time) { struct sock *sk = p; - return !skb_queue_empty(&sk->sk_receive_queue) || + return !skb_queue_empty_lockless(&sk->sk_receive_queue) || sk_busy_loop_timeout(sk, start_time); } EXPORT_SYMBOL(sk_busy_loop_end); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 1330a7442e5b..085cef5857bb 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -234,19 +234,23 @@ static void sock_map_free(struct bpf_map *map) int i; synchronize_rcu(); - rcu_read_lock(); raw_spin_lock_bh(&stab->lock); for (i = 0; i < stab->map.max_entries; i++) { struct sock **psk = &stab->sks[i]; struct sock *sk; sk = xchg(psk, NULL); - if (sk) + if (sk) { + lock_sock(sk); + rcu_read_lock(); sock_map_unref(sk, psk); + rcu_read_unlock(); + release_sock(sk); + } } raw_spin_unlock_bh(&stab->lock); - rcu_read_unlock(); + /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(stab->sks); @@ -345,7 +349,7 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx, return -EINVAL; if (unlikely(idx >= map->max_entries)) return -E2BIG; - if (unlikely(icsk->icsk_ulp_data)) + if (unlikely(rcu_access_pointer(icsk->icsk_ulp_data))) return -EINVAL; link = sk_psock_init_link(); @@ -413,14 +417,16 @@ static int sock_map_update_elem(struct bpf_map *map, void *key, ret = -EINVAL; goto out; } - if (!sock_map_sk_is_suitable(sk) || - sk->sk_state != TCP_ESTABLISHED) { + if (!sock_map_sk_is_suitable(sk)) { ret = -EOPNOTSUPP; goto out; } sock_map_sk_acquire(sk); - ret = sock_map_update_common(map, idx, sk, flags); + if (sk->sk_state != TCP_ESTABLISHED) + ret = -EOPNOTSUPP; + else + ret = sock_map_update_common(map, idx, sk, flags); sock_map_sk_release(sk); out: fput(sock->file); @@ -656,6 +662,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct inet_connection_sock *icsk = inet_csk(sk); u32 key_size = map->key_size, hash; struct bpf_htab_elem *elem, *elem_new; struct bpf_htab_bucket *bucket; @@ -666,6 +673,8 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; + if (unlikely(icsk->icsk_ulp_data)) + return -EINVAL; link = sk_psock_init_link(); if (!link) @@ -733,14 +742,16 @@ static int sock_hash_update_elem(struct bpf_map *map, void *key, ret = -EINVAL; goto out; } - if (!sock_map_sk_is_suitable(sk) || - sk->sk_state != TCP_ESTABLISHED) { + if (!sock_map_sk_is_suitable(sk)) { ret = -EOPNOTSUPP; goto out; } sock_map_sk_acquire(sk); - ret = sock_hash_update_common(map, key, sk, flags); + if (sk->sk_state != TCP_ESTABLISHED) + ret = -EOPNOTSUPP; + else + ret = sock_hash_update_common(map, key, sk, flags); sock_map_sk_release(sk); out: fput(sock->file); @@ -853,17 +864,22 @@ static void sock_hash_free(struct bpf_map *map) int i; synchronize_rcu(); - rcu_read_lock(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); raw_spin_lock_bh(&bucket->lock); hlist_for_each_entry_safe(elem, node, &bucket->head, node) { hlist_del_rcu(&elem->node); + lock_sock(elem->sk); + rcu_read_lock(); sock_map_unref(elem->sk, elem); + rcu_read_unlock(); + release_sock(elem->sk); } raw_spin_unlock_bh(&bucket->lock); } - rcu_read_unlock(); + + /* wait for psock readers accessing its map link */ + synchronize_rcu(); bpf_map_area_free(htab->buckets); kfree(htab); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 9408f9264d05..91e9f2223c39 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -107,7 +107,6 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) if (!more_reuse) return NULL; - more_reuse->max_socks = more_socks_size; more_reuse->num_socks = reuse->num_socks; more_reuse->prog = reuse->prog; more_reuse->reuseport_id = reuse->reuseport_id; @@ -295,8 +294,19 @@ struct sock *reuseport_select_sock(struct sock *sk, select_by_hash: /* no bpf or invalid bpf result: fall back to hash usage */ - if (!sk2) - sk2 = reuse->socks[reciprocal_scale(hash, socks)]; + if (!sk2) { + int i, j; + + i = j = reciprocal_scale(hash, socks); + while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { + i++; + if (i >= reuse->num_socks) + i = 0; + if (i == j) + goto out; + } + sk2 = reuse->socks[i]; + } } out: @@ -345,8 +355,8 @@ int reuseport_detach_prog(struct sock *sk) spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); - rcu_swap_protected(reuse->prog, old_prog, - lockdep_is_held(&reuseport_lock)); + old_prog = rcu_replace_pointer(reuse->prog, old_prog, + lockdep_is_held(&reuseport_lock)); spin_unlock_bh(&reuseport_lock); if (!old_prog) diff --git a/net/core/stream.c b/net/core/stream.c index e94bb02a5629..4f1d4aa5fb38 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -120,7 +120,6 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) int err = 0; long vm_wait = 0; long current_timeo = *timeo_p; - bool noblock = (*timeo_p ? false : true); DEFINE_WAIT_FUNC(wait, woken_wake_function); if (sk_stream_memory_free(sk)) @@ -133,11 +132,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; - if (!*timeo_p) { - if (noblock) - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - goto do_nonblock; - } + if (!*timeo_p) + goto do_eagain; if (signal_pending(current)) goto do_interrupted; sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -169,7 +165,13 @@ out: do_error: err = -EPIPE; goto out; -do_nonblock: +do_eagain: + /* Make sure that whenever EAGAIN is returned, EPOLLOUT event can + * be generated later. + * When TCP receives ACK packets that make room, tcp_check_space() + * only calls tcp_new_space() if SOCK_NOSPACE is set. + */ + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; goto out; do_interrupted: diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 8da5b3a54dac..9f9e00ba3ad7 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -288,6 +288,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, return ret; } +# ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -298,6 +299,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } +# endif /* CONFIG_HAVE_EBPF_JIT */ static int proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, @@ -567,6 +569,14 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_do_static_key, }, + { + .procname = "gro_normal_batch", + .data = &gro_normal_batch, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, { } }; diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 7911235706a9..04840697fe79 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -13,7 +13,7 @@ static unsigned int classify(const struct sk_buff *skb) { if (likely(skb->dev && skb->dev->phydev && - skb->dev->phydev->drv)) + skb->dev->phydev->mii_ts)) return ptp_classify_raw(skb); else return PTP_CLASS_NONE; @@ -21,7 +21,7 @@ static unsigned int classify(const struct sk_buff *skb) void skb_clone_tx_timestamp(struct sk_buff *skb) { - struct phy_device *phydev; + struct mii_timestamper *mii_ts; struct sk_buff *clone; unsigned int type; @@ -32,22 +32,22 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) if (type == PTP_CLASS_NONE) return; - phydev = skb->dev->phydev; - if (likely(phydev->drv->txtstamp)) { + mii_ts = skb->dev->phydev->mii_ts; + if (likely(mii_ts->txtstamp)) { clone = skb_clone_sk(skb); if (!clone) return; - phydev->drv->txtstamp(phydev, clone, type); + mii_ts->txtstamp(mii_ts, clone, type); } } EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); bool skb_defer_rx_timestamp(struct sk_buff *skb) { - struct phy_device *phydev; + struct mii_timestamper *mii_ts; unsigned int type; - if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->drv) + if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->mii_ts) return false; if (skb_headroom(skb) < ETH_HLEN) @@ -62,9 +62,9 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) if (type == PTP_CLASS_NONE) return false; - phydev = skb->dev->phydev; - if (likely(phydev->drv->rxtstamp)) - return phydev->drv->rxtstamp(phydev, skb, type); + mii_ts = skb->dev->phydev->mii_ts; + if (likely(mii_ts->rxtstamp)) + return mii_ts->rxtstamp(mii_ts, skb, type); return false; } diff --git a/net/core/tso.c b/net/core/tso.c index 43f4eba61933..d4d5c077ad72 100644 --- a/net/core/tso.c +++ b/net/core/tso.c @@ -55,8 +55,8 @@ void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size) skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; /* Move to next segment */ - tso->size = frag->size; - tso->data = page_address(frag->page.p) + frag->page_offset; + tso->size = skb_frag_size(frag); + tso->data = skb_frag_address(frag); tso->next_frag_idx++; } } @@ -79,8 +79,8 @@ void tso_start(struct sk_buff *skb, struct tso_t *tso) skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; /* Move to next segment */ - tso->size = frag->size; - tso->data = page_address(frag->page.p) + frag->page_offset; + tso->size = skb_frag_size(frag); + tso->data = skb_frag_address(frag); tso->next_frag_idx++; } } diff --git a/net/core/utils.c b/net/core/utils.c index 6b6e51db9f3b..1f31a39236d5 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -438,6 +438,23 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, } EXPORT_SYMBOL(inet_proto_csum_replace4); +/** + * inet_proto_csum_replace16 - update layer 4 header checksum field + * @sum: Layer 4 header checksum field + * @skb: sk_buff for the packet + * @from: old IPv6 address + * @to: new IPv6 address + * @pseudohdr: True if layer 4 header checksum includes pseudoheader + * + * Update layer 4 header as per the update in IPv6 src/dst address. + * + * There is no need to update skb->csum in this function, because update in two + * fields a.) IPv6 src/dst address and b.) L4 header checksum cancels each other + * for skb->csum calculation. Whereas inet_proto_csum_replace4 function needs to + * update skb->csum, because update in 3 fields a.) IPv4 src/dst address, + * b.) IPv4 Header checksum and c.) L4 header checksum results in same diff as + * L4 Header checksum for skb->csum calculation. + */ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, bool pseudohdr) @@ -449,9 +466,6 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_PARTIAL) { *sum = csum_fold(csum_partial(diff, sizeof(diff), ~csum_unfold(*sum))); - if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_partial(diff, sizeof(diff), - ~skb->csum); } else if (pseudohdr) *sum = ~csum_fold(csum_partial(diff, sizeof(diff), csum_unfold(*sum))); diff --git a/net/core/xdp.c b/net/core/xdp.c index d7bf62ffbb5e..8310714c47fd 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -36,7 +36,7 @@ static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) const u32 *k = data; const u32 key = *k; - BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id) + BUILD_BUG_ON(sizeof_field(struct xdp_mem_allocator, mem.id) != sizeof(u32)); /* Use cyclic increasing ID as direct hash key */ @@ -56,7 +56,7 @@ static const struct rhashtable_params mem_id_rht_params = { .nelem_hint = 64, .head_offset = offsetof(struct xdp_mem_allocator, node), .key_offset = offsetof(struct xdp_mem_allocator, mem.id), - .key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id), + .key_len = sizeof_field(struct xdp_mem_allocator, mem.id), .max_size = MEM_ID_MAX, .min_size = 8, .automatic_shrinking = true, @@ -70,77 +70,63 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) xa = container_of(rcu, struct xdp_mem_allocator, rcu); - /* Allocator have indicated safe to remove before this is called */ - if (xa->mem.type == MEM_TYPE_PAGE_POOL) - page_pool_free(xa->page_pool); - /* Allow this ID to be reused */ ida_simple_remove(&mem_id_pool, xa->mem.id); - /* Poison memory */ - xa->mem.id = 0xFFFF; - xa->mem.type = 0xF0F0; - xa->allocator = (void *)0xDEAD9001; - kfree(xa); } -static bool __mem_id_disconnect(int id, bool force) +static void mem_xa_remove(struct xdp_mem_allocator *xa) +{ + trace_mem_disconnect(xa); + + if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); +} + +static void mem_allocator_disconnect(void *allocator) { struct xdp_mem_allocator *xa; - bool safe_to_remove = true; + struct rhashtable_iter iter; mutex_lock(&mem_id_lock); - xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - WARN(1, "Request remove non-existing id(%d), driver bug?", id); - return true; - } - xa->disconnect_cnt++; + rhashtable_walk_enter(mem_id_ht, &iter); + do { + rhashtable_walk_start(&iter); - /* Detects in-flight packet-pages for page_pool */ - if (xa->mem.type == MEM_TYPE_PAGE_POOL) - safe_to_remove = page_pool_request_shutdown(xa->page_pool); + while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) { + if (xa->allocator == allocator) + mem_xa_remove(xa); + } - trace_mem_disconnect(xa, safe_to_remove, force); + rhashtable_walk_stop(&iter); - if ((safe_to_remove || force) && - !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) - call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + } while (xa == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&iter); mutex_unlock(&mem_id_lock); - return (safe_to_remove|force); } -#define DEFER_TIME (msecs_to_jiffies(1000)) -#define DEFER_WARN_INTERVAL (30 * HZ) -#define DEFER_MAX_RETRIES 120 - -static void mem_id_disconnect_defer_retry(struct work_struct *wq) +static void mem_id_disconnect(int id) { - struct delayed_work *dwq = to_delayed_work(wq); - struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq); - bool force = false; + struct xdp_mem_allocator *xa; - if (xa->disconnect_cnt > DEFER_MAX_RETRIES) - force = true; + mutex_lock(&mem_id_lock); - if (__mem_id_disconnect(xa->mem.id, force)) + xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); + if (!xa) { + mutex_unlock(&mem_id_lock); + WARN(1, "Request remove non-existing id(%d), driver bug?", id); return; + } - /* Periodic warning */ - if (time_after_eq(jiffies, xa->defer_warn)) { - int sec = (s32)((u32)jiffies - (u32)xa->defer_start) / HZ; + trace_mem_disconnect(xa); - pr_warn("%s() stalled mem.id=%u shutdown %d attempts %d sec\n", - __func__, xa->mem.id, xa->disconnect_cnt, sec); - xa->defer_warn = jiffies + DEFER_WARN_INTERVAL; - } + if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); - /* Still not ready to be disconnected, retry later */ - schedule_delayed_work(&xa->defer_wq, DEFER_TIME); + mutex_unlock(&mem_id_lock); } void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) @@ -153,38 +139,21 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) return; } - if (xdp_rxq->mem.type != MEM_TYPE_PAGE_POOL && - xdp_rxq->mem.type != MEM_TYPE_ZERO_COPY) { - return; - } - if (id == 0) return; - if (__mem_id_disconnect(id, false)) - return; - - /* Could not disconnect, defer new disconnect attempt to later */ - mutex_lock(&mem_id_lock); + if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY) + return mem_id_disconnect(id); - xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - return; + if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) { + rcu_read_lock(); + xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); + page_pool_destroy(xa->page_pool); + rcu_read_unlock(); } - xa->defer_start = jiffies; - xa->defer_warn = jiffies + DEFER_WARN_INTERVAL; - - INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry); - mutex_unlock(&mem_id_lock); - schedule_delayed_work(&xa->defer_wq, DEFER_TIME); } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); -/* This unregister operation will also cleanup and destroy the - * allocator. The page_pool_free() operation is first called when it's - * safe to remove, possibly deferred to a workqueue. - */ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) { /* Simplify driver cleanup code paths, allow unreg "unused" */ @@ -371,7 +340,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, } if (type == MEM_TYPE_PAGE_POOL) - page_pool_get(xdp_alloc->page_pool); + page_pool_use_xdp_mem(allocator, mem_allocator_disconnect); mutex_unlock(&mem_id_lock); @@ -386,7 +355,7 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); /* XDP RX runs under NAPI protection, and in different delivery error * scenarios (e.g. queue full), it is possible to return the xdp_frame - * while still leveraging this protection. The @napi_direct boolian + * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling * of xdp_frames/pages in those cases. */ @@ -402,15 +371,8 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); - if (likely(xa)) { - napi_direct &= !xdp_return_frame_no_direct(); - page_pool_put_page(xa->page_pool, page, napi_direct); - } else { - /* Hopefully stack show who to blame for late return */ - WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id); - trace_mem_return_failed(mem, page); - put_page(page); - } + napi_direct &= !xdp_return_frame_no_direct(); + page_pool_put_page(xa->page_pool, page, napi_direct); rcu_read_unlock(); break; case MEM_TYPE_PAGE_SHARED: diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b685bc82f8d0..d19557c6d04b 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -117,7 +117,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr, inet->inet_sport, inet->inet_dport); - inet->inet_id = dp->dccps_iss ^ jiffies; + inet->inet_id = prandom_u32(); err = dccp_connect(sk); rt = NULL; @@ -416,7 +416,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt)); newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; - newinet->inet_id = jiffies; + newinet->inet_id = prandom_u32(); if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) goto put_and_exit; @@ -871,7 +871,7 @@ lookup: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - nf_reset(skb); + nf_reset_ct(skb); return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 1b7381ff787b..1e5e08cc0bfc 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -210,7 +210,7 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; @@ -230,7 +230,8 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass); + err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass, + sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -281,10 +282,10 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6)); /* sk = NULL, but it is safe for now. RST socket required. */ - dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); + dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(skb, dst); - ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0); + ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0, 0); DCCP_INC_STATS(DCCP_MIB_OUTSEGS); DCCP_INC_STATS(DCCP_MIB_OUTRSTS); return; @@ -911,7 +912,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 5bad08dc4316..4af8a98fe784 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -944,7 +944,7 @@ int inet_dccp_listen(struct socket *sock, int backlog) if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) goto out; - sk->sk_max_ack_backlog = backlog; + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ @@ -1132,7 +1132,7 @@ static int __init dccp_init(void) int rc; BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > - FIELD_SIZEOF(struct sk_buff, cb)); + sizeof_field(struct sk_buff, cb)); rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL); if (rc) goto out_fail; diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 0ea75286abf4..0a46ea3bddd5 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -670,7 +670,7 @@ static int dn_create(struct net *net, struct socket *sock, int protocol, { struct sock *sk; - if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + if (protocol < 0 || protocol > U8_MAX) return -EINVAL; if (!net_eq(net, &init_net)) @@ -1091,7 +1091,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags, } cb = DN_SKB_CB(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, kern); if (newsk == NULL) { release_sock(sk); @@ -1205,7 +1205,7 @@ static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table *wai struct dn_scp *scp = DN_SK(sk); __poll_t mask = datagram_poll(file, sock, wait); - if (!skb_queue_empty(&scp->other_receive_queue)) + if (!skb_queue_empty_lockless(&scp->other_receive_queue)) mask |= EPOLLRDBAND; return mask; diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index e4161e0c86aa..c68503a18025 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -328,7 +328,7 @@ static void dn_nsp_conn_init(struct sock *sk, struct sk_buff *skb) return; } - sk->sk_ack_backlog++; + sk_acceptq_added(sk); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_state_change(sk); } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index aea918135ec3..08c3dc45f1a4 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -110,7 +110,8 @@ static void dn_dst_ifdown(struct dst_entry *, struct net_device *dev, int how); static struct dst_entry *dn_dst_negative_advice(struct dst_entry *); static void dn_dst_link_failure(struct sk_buff *); static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb , u32 mtu); + struct sk_buff *skb , u32 mtu, + bool confirm_neigh); static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst, @@ -251,7 +252,8 @@ static int dn_dst_gc(struct dst_ops *ops) * advertise to the other end). */ static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { struct dn_route *rt = (struct dn_route *) dst; struct neighbour *n = rt->n; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 6e942dda1bcd..92663dcb3aa2 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -20,7 +20,7 @@ if NET_DSA # tagging formats config NET_DSA_TAG_8021Q - tristate "Tag driver for switches using custom 802.1Q VLAN headers" + tristate select VLAN_8021Q help Unlike the other tagging protocols, the 802.1Q config option simply @@ -29,6 +29,12 @@ config NET_DSA_TAG_8021Q Drivers which use these helpers should select this as dependency. +config NET_DSA_TAG_AR9331 + tristate "Tag driver for Atheros AR9331 SoC with built-in switch" + help + Say Y or M if you want to enable support for tagging frames for + the Atheros AR9331 SoC with built-in switch. + config NET_DSA_TAG_BRCM_COMMON tristate default n @@ -73,23 +79,18 @@ config NET_DSA_TAG_MTK Say Y or M if you want to enable support for tagging frames for Mediatek switches. -config NET_DSA_TAG_KSZ_COMMON - tristate - default n - config NET_DSA_TAG_KSZ - tristate "Tag driver for Microchip 9893 family of switches" - select NET_DSA_TAG_KSZ_COMMON + tristate "Tag driver for Microchip 8795/9477/9893 families of switches" help Say Y if you want to enable support for tagging frames for the - Microchip 9893 family of switches. + Microchip 8795/9477/9893 families of switches. -config NET_DSA_TAG_KSZ9477 - tristate "Tag driver for Microchip 9477 family of switches" - select NET_DSA_TAG_KSZ_COMMON +config NET_DSA_TAG_OCELOT + tristate "Tag driver for Ocelot family of switches" + select PACKING help - Say Y if you want to enable support for tagging frames for the - Microchip 9477 family of switches. + Say Y or M if you want to enable support for tagging frames for the + Ocelot switches (VSC7511, VSC7512, VSC7513, VSC7514, VSC9959). config NET_DSA_TAG_QCA tristate "Tag driver for Qualcomm Atheros QCA8K switches" diff --git a/net/dsa/Makefile b/net/dsa/Makefile index c342f54715ba..108486cfdeef 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -5,13 +5,15 @@ dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o # tagging formats obj-$(CONFIG_NET_DSA_TAG_8021Q) += tag_8021q.o +obj-$(CONFIG_NET_DSA_TAG_AR9331) += tag_ar9331.o obj-$(CONFIG_NET_DSA_TAG_BRCM_COMMON) += tag_brcm.o obj-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o obj-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o -obj-$(CONFIG_NET_DSA_TAG_KSZ_COMMON) += tag_ksz.o +obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o +obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 43120a3fb06f..17281fec710c 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -246,7 +246,9 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, #ifdef CONFIG_PM_SLEEP static bool dsa_is_port_initialized(struct dsa_switch *ds, int p) { - return dsa_is_user_port(ds, p) && ds->ports[p].slave; + const struct dsa_port *dp = dsa_to_port(ds, p); + + return dp->type == DSA_PORT_TYPE_USER && dp->slave; } int dsa_switch_suspend(struct dsa_switch *ds) @@ -258,7 +260,7 @@ int dsa_switch_suspend(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_suspend(ds->ports[i].slave); + ret = dsa_slave_suspend(dsa_to_port(ds, i)->slave); if (ret) return ret; } @@ -285,7 +287,7 @@ int dsa_switch_resume(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_resume(ds->ports[i].slave); + ret = dsa_slave_resume(dsa_to_port(ds, i)->slave); if (ret) return ret; } @@ -329,6 +331,91 @@ int call_dsa_notifiers(unsigned long val, struct net_device *dev, } EXPORT_SYMBOL_GPL(call_dsa_notifiers); +int dsa_devlink_param_get(struct devlink *dl, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct dsa_devlink_priv *dl_priv; + struct dsa_switch *ds; + + dl_priv = devlink_priv(dl); + ds = dl_priv->ds; + + if (!ds->ops->devlink_param_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_param_get(ds, id, ctx); +} +EXPORT_SYMBOL_GPL(dsa_devlink_param_get); + +int dsa_devlink_param_set(struct devlink *dl, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct dsa_devlink_priv *dl_priv; + struct dsa_switch *ds; + + dl_priv = devlink_priv(dl); + ds = dl_priv->ds; + + if (!ds->ops->devlink_param_set) + return -EOPNOTSUPP; + + return ds->ops->devlink_param_set(ds, id, ctx); +} +EXPORT_SYMBOL_GPL(dsa_devlink_param_set); + +int dsa_devlink_params_register(struct dsa_switch *ds, + const struct devlink_param *params, + size_t params_count) +{ + return devlink_params_register(ds->devlink, params, params_count); +} +EXPORT_SYMBOL_GPL(dsa_devlink_params_register); + +void dsa_devlink_params_unregister(struct dsa_switch *ds, + const struct devlink_param *params, + size_t params_count) +{ + devlink_params_unregister(ds->devlink, params, params_count); +} +EXPORT_SYMBOL_GPL(dsa_devlink_params_unregister); + +int dsa_devlink_resource_register(struct dsa_switch *ds, + const char *resource_name, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + const struct devlink_resource_size_params *size_params) +{ + return devlink_resource_register(ds->devlink, resource_name, + resource_size, resource_id, + parent_resource_id, + size_params); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resource_register); + +void dsa_devlink_resources_unregister(struct dsa_switch *ds) +{ + devlink_resources_unregister(ds->devlink, NULL); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resources_unregister); + +void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, + u64 resource_id, + devlink_resource_occ_get_t *occ_get, + void *occ_get_priv) +{ + return devlink_resource_occ_get_register(ds->devlink, resource_id, + occ_get, occ_get_priv); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_register); + +void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, + u64 resource_id) +{ + devlink_resource_occ_get_unregister(ds->devlink, resource_id); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister); + static int __init dsa_init_module(void) { int rc; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 3abd173ebacb..e7c30b472034 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -45,8 +45,12 @@ static struct dsa_switch_tree *dsa_tree_alloc(int index) dst->index = index; + INIT_LIST_HEAD(&dst->rtable); + + INIT_LIST_HEAD(&dst->ports); + INIT_LIST_HEAD(&dst->list); - list_add_tail(&dsa_tree_list, &dst->list); + list_add_tail(&dst->list, &dsa_tree_list); kref_init(&dst->refcount); @@ -111,24 +115,39 @@ static bool dsa_port_is_user(struct dsa_port *dp) static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst, struct device_node *dn) { - struct dsa_switch *ds; struct dsa_port *dp; - int device, port; - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; + list_for_each_entry(dp, &dst->ports, list) + if (dp->dn == dn) + return dp; - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; + return NULL; +} - if (dp->dn == dn) - return dp; - } - } +static struct dsa_link *dsa_link_touch(struct dsa_port *dp, + struct dsa_port *link_dp) +{ + struct dsa_switch *ds = dp->ds; + struct dsa_switch_tree *dst; + struct dsa_link *dl; - return NULL; + dst = ds->dst; + + list_for_each_entry(dl, &dst->rtable, list) + if (dl->dp == dp && dl->link_dp == link_dp) + return dl; + + dl = kzalloc(sizeof(*dl), GFP_KERNEL); + if (!dl) + return NULL; + + dl->dp = dp; + dl->link_dp = link_dp; + + INIT_LIST_HEAD(&dl->list); + list_add_tail(&dl->list, &dst->rtable); + + return dl; } static bool dsa_port_setup_routing_table(struct dsa_port *dp) @@ -138,6 +157,7 @@ static bool dsa_port_setup_routing_table(struct dsa_port *dp) struct device_node *dn = dp->dn; struct of_phandle_iterator it; struct dsa_port *link_dp; + struct dsa_link *dl; int err; of_for_each_phandle(&it, err, dn, "link", NULL, 0) { @@ -147,24 +167,22 @@ static bool dsa_port_setup_routing_table(struct dsa_port *dp) return false; } - ds->rtable[link_dp->ds->index] = dp->index; + dl = dsa_link_touch(dp, link_dp); + if (!dl) { + of_node_put(it.node); + return false; + } } return true; } -static bool dsa_switch_setup_routing_table(struct dsa_switch *ds) +static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst) { bool complete = true; struct dsa_port *dp; - int i; - - for (i = 0; i < DSA_MAX_SWITCHES; i++) - ds->rtable[i] = DSA_RTABLE_NONE; - - for (i = 0; i < ds->num_ports; i++) { - dp = &ds->ports[i]; + list_for_each_entry(dp, &dst->ports, list) { if (dsa_port_is_dsa(dp)) { complete = dsa_port_setup_routing_table(dp); if (!complete) @@ -175,178 +193,176 @@ static bool dsa_switch_setup_routing_table(struct dsa_switch *ds) return complete; } -static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst) -{ - struct dsa_switch *ds; - bool complete = true; - int device; - - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; - - complete = dsa_switch_setup_routing_table(ds); - if (!complete) - break; - } - - return complete; -} - static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; struct dsa_port *dp; - int device, port; - - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; - if (dsa_port_is_cpu(dp)) - return dp; - } - } + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_cpu(dp)) + return dp; return NULL; } static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; - struct dsa_port *dp; - int device, port; + struct dsa_port *cpu_dp, *dp; - /* DSA currently only supports a single CPU port */ - dst->cpu_dp = dsa_tree_find_first_cpu(dst); - if (!dst->cpu_dp) { - pr_warn("Tree has no master device\n"); + cpu_dp = dsa_tree_find_first_cpu(dst); + if (!cpu_dp) { + pr_err("DSA: tree %d has no CPU port\n", dst->index); return -EINVAL; } /* Assign the default CPU port to all ports of the fabric */ - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; - - if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) - dp->cpu_dp = dst->cpu_dp; - } - } + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) + dp->cpu_dp = cpu_dp; return 0; } static void dsa_tree_teardown_default_cpu(struct dsa_switch_tree *dst) { - /* DSA currently only supports a single CPU port */ - dst->cpu_dp = NULL; + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) + dp->cpu_dp = NULL; } static int dsa_port_setup(struct dsa_port *dp) { - enum devlink_port_flavour flavour; struct dsa_switch *ds = dp->ds; struct dsa_switch_tree *dst = ds->dst; + const unsigned char *id = (const unsigned char *)&dst->index; + const unsigned char len = sizeof(dst->index); + struct devlink_port *dlp = &dp->devlink_port; + bool dsa_port_link_registered = false; + bool devlink_port_registered = false; + struct devlink *dl = ds->devlink; + bool dsa_port_enabled = false; int err = 0; - if (dp->type == DSA_PORT_TYPE_UNUSED) + if (dp->setup) return 0; - memset(&dp->devlink_port, 0, sizeof(dp->devlink_port)); - dp->mac = of_get_mac_address(dp->dn); - - switch (dp->type) { - case DSA_PORT_TYPE_CPU: - flavour = DEVLINK_PORT_FLAVOUR_CPU; - break; - case DSA_PORT_TYPE_DSA: - flavour = DEVLINK_PORT_FLAVOUR_DSA; - break; - case DSA_PORT_TYPE_USER: /* fall-through */ - default: - flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; - break; - } - - /* dp->index is used now as port_number. However - * CPU and DSA ports should have separate numbering - * independent from front panel port numbers. - */ - devlink_port_attrs_set(&dp->devlink_port, flavour, - dp->index, false, 0, - (const char *) &dst->index, sizeof(dst->index)); - err = devlink_port_register(ds->devlink, &dp->devlink_port, - dp->index); - if (err) - return err; - switch (dp->type) { case DSA_PORT_TYPE_UNUSED: + dsa_port_disable(dp); break; case DSA_PORT_TYPE_CPU: + memset(dlp, 0, sizeof(*dlp)); + devlink_port_attrs_set(dlp, DEVLINK_PORT_FLAVOUR_CPU, + dp->index, false, 0, id, len); + err = devlink_port_register(dl, dlp, dp->index); + if (err) + break; + devlink_port_registered = true; + err = dsa_port_link_register_of(dp); if (err) - dev_err(ds->dev, "failed to setup link for port %d.%d\n", - ds->index, dp->index); + break; + dsa_port_link_registered = true; + + err = dsa_port_enable(dp, NULL); + if (err) + break; + dsa_port_enabled = true; + break; case DSA_PORT_TYPE_DSA: + memset(dlp, 0, sizeof(*dlp)); + devlink_port_attrs_set(dlp, DEVLINK_PORT_FLAVOUR_DSA, + dp->index, false, 0, id, len); + err = devlink_port_register(dl, dlp, dp->index); + if (err) + break; + devlink_port_registered = true; + err = dsa_port_link_register_of(dp); if (err) - dev_err(ds->dev, "failed to setup link for port %d.%d\n", - ds->index, dp->index); + break; + dsa_port_link_registered = true; + + err = dsa_port_enable(dp, NULL); + if (err) + break; + dsa_port_enabled = true; + break; case DSA_PORT_TYPE_USER: + memset(dlp, 0, sizeof(*dlp)); + devlink_port_attrs_set(dlp, DEVLINK_PORT_FLAVOUR_PHYSICAL, + dp->index, false, 0, id, len); + err = devlink_port_register(dl, dlp, dp->index); + if (err) + break; + devlink_port_registered = true; + + dp->mac = of_get_mac_address(dp->dn); err = dsa_slave_create(dp); if (err) - dev_err(ds->dev, "failed to create slave for port %d.%d\n", - ds->index, dp->index); - else - devlink_port_type_eth_set(&dp->devlink_port, dp->slave); + break; + + devlink_port_type_eth_set(dlp, dp->slave); break; } + if (err && dsa_port_enabled) + dsa_port_disable(dp); + if (err && dsa_port_link_registered) + dsa_port_link_unregister_of(dp); + if (err && devlink_port_registered) + devlink_port_unregister(dlp); if (err) - devlink_port_unregister(&dp->devlink_port); + return err; - return err; + dp->setup = true; + + return 0; } static void dsa_port_teardown(struct dsa_port *dp) { - if (dp->type != DSA_PORT_TYPE_UNUSED) - devlink_port_unregister(&dp->devlink_port); + struct devlink_port *dlp = &dp->devlink_port; + + if (!dp->setup) + return; switch (dp->type) { case DSA_PORT_TYPE_UNUSED: break; case DSA_PORT_TYPE_CPU: + dsa_port_disable(dp); dsa_tag_driver_put(dp->tag_ops); - /* fall-through */ + devlink_port_unregister(dlp); + dsa_port_link_unregister_of(dp); + break; case DSA_PORT_TYPE_DSA: + dsa_port_disable(dp); + devlink_port_unregister(dlp); dsa_port_link_unregister_of(dp); break; case DSA_PORT_TYPE_USER: + devlink_port_unregister(dlp); if (dp->slave) { dsa_slave_destroy(dp->slave); dp->slave = NULL; } break; } + + dp->setup = false; } static int dsa_switch_setup(struct dsa_switch *ds) { - int err = 0; + struct dsa_devlink_priv *dl_priv; + int err; + + if (ds->setup) + return 0; /* Initialize ds->phys_mii_mask before registering the slave MDIO bus * driver and before ops->setup() has run, since the switch drivers and @@ -358,9 +374,11 @@ static int dsa_switch_setup(struct dsa_switch *ds) /* Add the switch to devlink before calling setup, so that setup can * add dpipe tables */ - ds->devlink = devlink_alloc(&dsa_devlink_ops, 0); + ds->devlink = devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv)); if (!ds->devlink) return -ENOMEM; + dl_priv = devlink_priv(ds->devlink); + dl_priv->ds = ds; err = devlink_register(ds->devlink, ds->dev); if (err) @@ -374,6 +392,8 @@ static int dsa_switch_setup(struct dsa_switch *ds) if (err < 0) goto unregister_notifier; + devlink_params_publish(ds->devlink); + if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) { @@ -388,6 +408,8 @@ static int dsa_switch_setup(struct dsa_switch *ds) goto unregister_notifier; } + ds->setup = true; + return 0; unregister_notifier: @@ -403,6 +425,9 @@ free_devlink: static void dsa_switch_teardown(struct dsa_switch *ds) { + if (!ds->setup) + return; + if (ds->slave_mii_bus && ds->ops->phy_read) mdiobus_unregister(ds->slave_mii_bus); @@ -417,95 +442,72 @@ static void dsa_switch_teardown(struct dsa_switch *ds) ds->devlink = NULL; } + ds->setup = false; } static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; struct dsa_port *dp; - int device, port, i; - int err = 0; - - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; + int err; - err = dsa_switch_setup(ds); + list_for_each_entry(dp, &dst->ports, list) { + err = dsa_switch_setup(dp->ds); if (err) - goto switch_teardown; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; + goto teardown; + } - err = dsa_port_setup(dp); - if (err) - goto ports_teardown; - } + list_for_each_entry(dp, &dst->ports, list) { + err = dsa_port_setup(dp); + if (err) + goto teardown; } return 0; -ports_teardown: - for (i = 0; i < port; i++) - dsa_port_teardown(&ds->ports[i]); - - dsa_switch_teardown(ds); +teardown: + list_for_each_entry(dp, &dst->ports, list) + dsa_port_teardown(dp); -switch_teardown: - for (i = 0; i < device; i++) { - ds = dst->ds[i]; - if (!ds) - continue; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; - - dsa_port_teardown(dp); - } - - dsa_switch_teardown(ds); - } + list_for_each_entry(dp, &dst->ports, list) + dsa_switch_teardown(dp->ds); return err; } static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; struct dsa_port *dp; - int device, port; - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; - - dsa_port_teardown(dp); - } + list_for_each_entry(dp, &dst->ports, list) + dsa_port_teardown(dp); - dsa_switch_teardown(ds); - } + list_for_each_entry(dp, &dst->ports, list) + dsa_switch_teardown(dp->ds); } static int dsa_tree_setup_master(struct dsa_switch_tree *dst) { - struct dsa_port *cpu_dp = dst->cpu_dp; - struct net_device *master = cpu_dp->master; + struct dsa_port *dp; + int err; - /* DSA currently supports a single pair of CPU port and master device */ - return dsa_master_setup(master, cpu_dp); + list_for_each_entry(dp, &dst->ports, list) { + if (dsa_port_is_cpu(dp)) { + err = dsa_master_setup(dp->master, dp); + if (err) + return err; + } + } + + return 0; } static void dsa_tree_teardown_master(struct dsa_switch_tree *dst) { - struct dsa_port *cpu_dp = dst->cpu_dp; - struct net_device *master = cpu_dp->master; + struct dsa_port *dp; - return dsa_master_teardown(master); + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_cpu(dp)) + dsa_master_teardown(dp->master); } static int dsa_tree_setup(struct dsa_switch_tree *dst) @@ -551,6 +553,8 @@ teardown_default_cpu: static void dsa_tree_teardown(struct dsa_switch_tree *dst) { + struct dsa_link *dl, *next; + if (!dst->setup) return; @@ -560,39 +564,36 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst) dsa_tree_teardown_default_cpu(dst); + list_for_each_entry_safe(dl, next, &dst->rtable, list) { + list_del(&dl->list); + kfree(dl); + } + pr_info("DSA: tree %d torn down\n", dst->index); dst->setup = false; } -static void dsa_tree_remove_switch(struct dsa_switch_tree *dst, - unsigned int index) +static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index) { - dsa_tree_teardown(dst); + struct dsa_switch_tree *dst = ds->dst; + struct dsa_port *dp; - dst->ds[index] = NULL; - dsa_tree_put(dst); -} + list_for_each_entry(dp, &dst->ports, list) + if (dp->ds == ds && dp->index == index) + return dp; -static int dsa_tree_add_switch(struct dsa_switch_tree *dst, - struct dsa_switch *ds) -{ - unsigned int index = ds->index; - int err; + dp = kzalloc(sizeof(*dp), GFP_KERNEL); + if (!dp) + return NULL; - if (dst->ds[index]) - return -EBUSY; + dp->ds = ds; + dp->index = index; - dsa_tree_get(dst); - dst->ds[index] = ds; + INIT_LIST_HEAD(&dp->list); + list_add_tail(&dp->list, &dst->ports); - err = dsa_tree_setup(dst); - if (err) { - dst->ds[index] = NULL; - dsa_tree_put(dst); - } - - return err; + return dp; } static int dsa_port_parse_user(struct dsa_port *dp, const char *name) @@ -613,6 +614,32 @@ static int dsa_port_parse_dsa(struct dsa_port *dp) return 0; } +static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp, + struct net_device *master) +{ + enum dsa_tag_protocol tag_protocol = DSA_TAG_PROTO_NONE; + struct dsa_switch *mds, *ds = dp->ds; + unsigned int mdp_upstream; + struct dsa_port *mdp; + + /* It is possible to stack DSA switches onto one another when that + * happens the switch driver may want to know if its tagging protocol + * is going to work in such a configuration. + */ + if (dsa_slave_dev_check(master)) { + mdp = dsa_slave_to_port(master); + mds = mdp->ds; + mdp_upstream = dsa_upstream_port(mds, mdp->index); + tag_protocol = mds->ops->get_tag_protocol(mds, mdp_upstream, + DSA_TAG_PROTO_NONE); + } + + /* If the master device is not itself a DSA slave in a disjoint DSA + * tree, then return immediately. + */ + return ds->ops->get_tag_protocol(ds, dp->index, tag_protocol); +} + static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) { struct dsa_switch *ds = dp->ds; @@ -620,18 +647,21 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; - tag_protocol = ds->ops->get_tag_protocol(ds, dp->index); + tag_protocol = dsa_get_tag_protocol(dp, master); tag_ops = dsa_tag_driver_get(tag_protocol); if (IS_ERR(tag_ops)) { + if (PTR_ERR(tag_ops) == -ENOPROTOOPT) + return -EPROBE_DEFER; dev_warn(ds->dev, "No tagger for this switch\n"); + dp->master = NULL; return PTR_ERR(tag_ops); } + dp->master = master; dp->type = DSA_PORT_TYPE_CPU; dp->filter = tag_ops->filter; dp->rcv = tag_ops->rcv; dp->tag_ops = tag_ops; - dp->master = master; dp->dst = dst; return 0; @@ -685,7 +715,7 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds, goto out_put_node; } - dp = &ds->ports[reg]; + dp = dsa_to_port(ds, reg); err = dsa_port_parse_of(dp, port); if (err) @@ -709,8 +739,6 @@ static int dsa_switch_parse_member_of(struct dsa_switch *ds, return sz; ds->index = m[1]; - if (ds->index >= DSA_MAX_SWITCHES) - return -EINVAL; ds->dst = dsa_tree_touch(m[0]); if (!ds->dst) @@ -719,6 +747,20 @@ static int dsa_switch_parse_member_of(struct dsa_switch *ds, return 0; } +static int dsa_switch_touch_ports(struct dsa_switch *ds) +{ + struct dsa_port *dp; + int port; + + for (port = 0; port < ds->num_ports; port++) { + dp = dsa_port_touch(ds, port); + if (!dp) + return -ENOMEM; + } + + return 0; +} + static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn) { int err; @@ -727,6 +769,10 @@ static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn) if (err) return err; + err = dsa_switch_touch_ports(ds); + if (err) + return err; + return dsa_switch_parse_ports_of(ds, dn); } @@ -764,7 +810,7 @@ static int dsa_switch_parse_ports(struct dsa_switch *ds, for (i = 0; i < DSA_MAX_PORTS; i++) { name = cd->port_names[i]; dev = cd->netdev[i]; - dp = &ds->ports[i]; + dp = dsa_to_port(ds, i); if (!name) continue; @@ -784,6 +830,8 @@ static int dsa_switch_parse_ports(struct dsa_switch *ds, static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) { + int err; + ds->cd = cd; /* We don't support interconnected switches nor multiple trees via @@ -794,69 +842,67 @@ static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) if (!ds->dst) return -ENOMEM; + err = dsa_switch_touch_ports(ds); + if (err) + return err; + return dsa_switch_parse_ports(ds, cd); } -static int dsa_switch_add(struct dsa_switch *ds) +static void dsa_switch_release_ports(struct dsa_switch *ds) { struct dsa_switch_tree *dst = ds->dst; + struct dsa_port *dp, *next; - return dsa_tree_add_switch(dst, ds); + list_for_each_entry_safe(dp, next, &dst->ports, list) { + if (dp->ds != ds) + continue; + list_del(&dp->list); + kfree(dp); + } } static int dsa_switch_probe(struct dsa_switch *ds) { - struct dsa_chip_data *pdata = ds->dev->platform_data; - struct device_node *np = ds->dev->of_node; + struct dsa_switch_tree *dst; + struct dsa_chip_data *pdata; + struct device_node *np; int err; - if (np) - err = dsa_switch_parse_of(ds, np); - else if (pdata) - err = dsa_switch_parse(ds, pdata); - else - err = -ENODEV; + if (!ds->dev) + return -ENODEV; - if (err) - return err; + pdata = ds->dev->platform_data; + np = ds->dev->of_node; - return dsa_switch_add(ds); -} - -struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n) -{ - struct dsa_switch *ds; - int i; - - ds = devm_kzalloc(dev, struct_size(ds, ports, n), GFP_KERNEL); - if (!ds) - return NULL; + if (!ds->num_ports) + return -EINVAL; - /* We avoid allocating memory outside dsa_switch - * if it is not needed. - */ - if (n <= sizeof(ds->_bitmap) * 8) { - ds->bitmap = &ds->_bitmap; + if (np) { + err = dsa_switch_parse_of(ds, np); + if (err) + dsa_switch_release_ports(ds); + } else if (pdata) { + err = dsa_switch_parse(ds, pdata); + if (err) + dsa_switch_release_ports(ds); } else { - ds->bitmap = devm_kcalloc(dev, - BITS_TO_LONGS(n), - sizeof(unsigned long), - GFP_KERNEL); - if (unlikely(!ds->bitmap)) - return NULL; + err = -ENODEV; } - ds->dev = dev; - ds->num_ports = n; + if (err) + return err; - for (i = 0; i < ds->num_ports; ++i) { - ds->ports[i].index = i; - ds->ports[i].ds = ds; + dst = ds->dst; + dsa_tree_get(dst); + err = dsa_tree_setup(dst); + if (err) { + dsa_switch_release_ports(ds); + dsa_tree_put(dst); } - return ds; + return err; } -EXPORT_SYMBOL_GPL(dsa_switch_alloc); int dsa_register_switch(struct dsa_switch *ds) { @@ -874,9 +920,10 @@ EXPORT_SYMBOL_GPL(dsa_register_switch); static void dsa_switch_remove(struct dsa_switch *ds) { struct dsa_switch_tree *dst = ds->dst; - unsigned int index = ds->index; - dsa_tree_remove_switch(dst, index); + dsa_tree_teardown(dst); + dsa_switch_release_ports(ds); + dsa_tree_put(dst); } void dsa_unregister_switch(struct dsa_switch *ds) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 12f8c7ee4dd8..a7662e7a691d 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -104,25 +104,14 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, { struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; - struct dsa_switch *ds; - struct dsa_port *slave_port; + struct dsa_port *dp; - if (device < 0 || device >= DSA_MAX_SWITCHES) - return NULL; + list_for_each_entry(dp, &dst->ports, list) + if (dp->ds->index == device && dp->index == port && + dp->type == DSA_PORT_TYPE_USER) + return dp->slave; - ds = dst->ds[device]; - if (!ds) - return NULL; - - if (port < 0 || port >= ds->num_ports) - return NULL; - - slave_port = &ds->ports[port]; - - if (unlikely(slave_port->type != DSA_PORT_TYPE_USER)) - return NULL; - - return slave_port->slave; + return NULL; } /* port.c */ @@ -161,22 +150,6 @@ int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags); int dsa_port_vid_del(struct dsa_port *dp, u16 vid); int dsa_port_link_register_of(struct dsa_port *dp); void dsa_port_link_unregister_of(struct dsa_port *dp); -void dsa_port_phylink_validate(struct phylink_config *config, - unsigned long *supported, - struct phylink_link_state *state); -int dsa_port_phylink_mac_link_state(struct phylink_config *config, - struct phylink_link_state *state); -void dsa_port_phylink_mac_config(struct phylink_config *config, - unsigned int mode, - const struct phylink_link_state *state); -void dsa_port_phylink_mac_an_restart(struct phylink_config *config); -void dsa_port_phylink_mac_link_down(struct phylink_config *config, - unsigned int mode, - phy_interface_t interface); -void dsa_port_phylink_mac_link_up(struct phylink_config *config, - unsigned int mode, - phy_interface_t interface, - struct phy_device *phydev); extern const struct phylink_mac_ops dsa_port_phylink_mac_ops; /* slave.c */ @@ -184,13 +157,12 @@ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); int dsa_slave_create(struct dsa_port *dp); void dsa_slave_destroy(struct net_device *slave_dev); +bool dsa_slave_dev_check(const struct net_device *dev); int dsa_slave_suspend(struct net_device *slave_dev); int dsa_slave_resume(struct net_device *slave_dev); int dsa_slave_register_notifier(void); void dsa_slave_unregister_notifier(void); -void *dsa_defer_xmit(struct sk_buff *skb, struct net_device *dev); - static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); diff --git a/net/dsa/master.c b/net/dsa/master.c index 4b52f8bac5e1..bd44bde272f4 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -8,6 +8,70 @@ #include "dsa_priv.h" +static int dsa_master_get_regs_len(struct net_device *dev) +{ + struct dsa_port *cpu_dp = dev->dsa_ptr; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; + struct dsa_switch *ds = cpu_dp->ds; + int port = cpu_dp->index; + int ret = 0; + int len; + + if (ops->get_regs_len) { + len = ops->get_regs_len(dev); + if (len < 0) + return len; + ret += len; + } + + ret += sizeof(struct ethtool_drvinfo); + ret += sizeof(struct ethtool_regs); + + if (ds->ops->get_regs_len) { + len = ds->ops->get_regs_len(ds, port); + if (len < 0) + return len; + ret += len; + } + + return ret; +} + +static void dsa_master_get_regs(struct net_device *dev, + struct ethtool_regs *regs, void *data) +{ + struct dsa_port *cpu_dp = dev->dsa_ptr; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; + struct dsa_switch *ds = cpu_dp->ds; + struct ethtool_drvinfo *cpu_info; + struct ethtool_regs *cpu_regs; + int port = cpu_dp->index; + int len; + + if (ops->get_regs_len && ops->get_regs) { + len = ops->get_regs_len(dev); + if (len < 0) + return; + regs->len = len; + ops->get_regs(dev, regs, data); + data += regs->len; + } + + cpu_info = (struct ethtool_drvinfo *)data; + strlcpy(cpu_info->driver, "dsa", sizeof(cpu_info->driver)); + data += sizeof(*cpu_info); + cpu_regs = (struct ethtool_regs *)data; + data += sizeof(*cpu_regs); + + if (ds->ops->get_regs_len && ds->ops->get_regs) { + len = ds->ops->get_regs_len(ds, port); + if (len < 0) + return; + cpu_regs->len = len; + ds->ops->get_regs(ds, port, cpu_regs, data); + } +} + static void dsa_master_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) @@ -133,6 +197,35 @@ static int dsa_master_get_phys_port_name(struct net_device *dev, return 0; } +static int dsa_master_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_switch *ds = cpu_dp->ds; + struct dsa_switch_tree *dst; + int err = -EOPNOTSUPP; + struct dsa_port *dp; + + dst = ds->dst; + + switch (cmd) { + case SIOCGHWTSTAMP: + case SIOCSHWTSTAMP: + /* Deny PTP operations on master if there is at least one + * switch in the tree that is PTP capable. + */ + list_for_each_entry(dp, &dst->ports, list) + if (dp->ds->ops->port_hwtstamp_get || + dp->ds->ops->port_hwtstamp_set) + return -EBUSY; + break; + } + + if (cpu_dp->orig_ndo_ops && cpu_dp->orig_ndo_ops->ndo_do_ioctl) + err = cpu_dp->orig_ndo_ops->ndo_do_ioctl(dev, ifr, cmd); + + return err; +} + static int dsa_master_ethtool_setup(struct net_device *dev) { struct dsa_port *cpu_dp = dev->dsa_ptr; @@ -147,6 +240,8 @@ static int dsa_master_ethtool_setup(struct net_device *dev) if (cpu_dp->orig_ethtool_ops) memcpy(ops, cpu_dp->orig_ethtool_ops, sizeof(*ops)); + ops->get_regs_len = dsa_master_get_regs_len; + ops->get_regs = dsa_master_get_regs; ops->get_sset_count = dsa_master_get_sset_count; ops->get_ethtool_stats = dsa_master_get_ethtool_stats; ops->get_strings = dsa_master_get_strings; @@ -183,6 +278,7 @@ static int dsa_master_ndo_setup(struct net_device *dev) memcpy(ops, cpu_dp->orig_ndo_ops, sizeof(*ops)); ops->ndo_get_phys_port_name = dsa_master_get_phys_port_name; + ops->ndo_do_ioctl = dsa_master_ioctl; dev->netdev_ops = ops; @@ -244,8 +340,6 @@ static void dsa_master_reset_mtu(struct net_device *dev) rtnl_unlock(); } -static struct lock_class_key dsa_master_addr_list_lock_key; - int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { int ret; @@ -259,9 +353,6 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) wmb(); dev->dsa_ptr = cpu_dp; - lockdep_set_class(&dev->addr_list_lock, - &dsa_master_addr_list_lock_key); - ret = dsa_master_ethtool_setup(dev); if (ret) return ret; diff --git a/net/dsa/port.c b/net/dsa/port.c index f071acf2842b..774facb8d547 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -348,10 +348,7 @@ int dsa_port_vlan_add(struct dsa_port *dp, .vlan = vlan, }; - if (!dp->bridge_dev || br_vlan_enabled(dp->bridge_dev)) - return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); - - return 0; + return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); } int dsa_port_vlan_del(struct dsa_port *dp, @@ -363,10 +360,7 @@ int dsa_port_vlan_del(struct dsa_port *dp, .vlan = vlan, }; - if (!dp->bridge_dev || br_vlan_enabled(dp->bridge_dev)) - return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); - - return 0; + return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); } int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags) @@ -382,8 +376,8 @@ int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags) trans.ph_prepare = true; err = dsa_port_vlan_add(dp, &vlan, &trans); - if (err == -EOPNOTSUPP) - return 0; + if (err) + return err; trans.ph_prepare = false; return dsa_port_vlan_add(dp, &vlan, &trans); @@ -421,9 +415,9 @@ static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp) return phydev; } -void dsa_port_phylink_validate(struct phylink_config *config, - unsigned long *supported, - struct phylink_link_state *state) +static void dsa_port_phylink_validate(struct phylink_config *config, + unsigned long *supported, + struct phylink_link_state *state) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; @@ -433,25 +427,26 @@ void dsa_port_phylink_validate(struct phylink_config *config, ds->ops->phylink_validate(ds, dp->index, supported, state); } -EXPORT_SYMBOL_GPL(dsa_port_phylink_validate); -int dsa_port_phylink_mac_link_state(struct phylink_config *config, - struct phylink_link_state *state) +static void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, + struct phylink_link_state *state) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; - /* Only called for SGMII and 802.3z */ - if (!ds->ops->phylink_mac_link_state) - return -EOPNOTSUPP; + /* Only called for inband modes */ + if (!ds->ops->phylink_mac_link_state) { + state->link = 0; + return; + } - return ds->ops->phylink_mac_link_state(ds, dp->index, state); + if (ds->ops->phylink_mac_link_state(ds, dp->index, state) < 0) + state->link = 0; } -EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_state); -void dsa_port_phylink_mac_config(struct phylink_config *config, - unsigned int mode, - const struct phylink_link_state *state) +static void dsa_port_phylink_mac_config(struct phylink_config *config, + unsigned int mode, + const struct phylink_link_state *state) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; @@ -461,9 +456,8 @@ void dsa_port_phylink_mac_config(struct phylink_config *config, ds->ops->phylink_mac_config(ds, dp->index, mode, state); } -EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_config); -void dsa_port_phylink_mac_an_restart(struct phylink_config *config) +static void dsa_port_phylink_mac_an_restart(struct phylink_config *config) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; @@ -473,11 +467,10 @@ void dsa_port_phylink_mac_an_restart(struct phylink_config *config) ds->ops->phylink_mac_an_restart(ds, dp->index); } -EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_an_restart); -void dsa_port_phylink_mac_link_down(struct phylink_config *config, - unsigned int mode, - phy_interface_t interface) +static void dsa_port_phylink_mac_link_down(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct phy_device *phydev = NULL; @@ -494,12 +487,11 @@ void dsa_port_phylink_mac_link_down(struct phylink_config *config, ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface); } -EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_down); -void dsa_port_phylink_mac_link_up(struct phylink_config *config, - unsigned int mode, - phy_interface_t interface, - struct phy_device *phydev) +static void dsa_port_phylink_mac_link_up(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface, + struct phy_device *phydev) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; @@ -512,11 +504,10 @@ void dsa_port_phylink_mac_link_up(struct phylink_config *config, ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev); } -EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_up); const struct phylink_mac_ops dsa_port_phylink_mac_ops = { .validate = dsa_port_phylink_validate, - .mac_link_state = dsa_port_phylink_mac_link_state, + .mac_pcs_get_state = dsa_port_phylink_mac_pcs_get_state, .mac_config = dsa_port_phylink_mac_config, .mac_an_restart = dsa_port_phylink_mac_an_restart, .mac_link_down = dsa_port_phylink_mac_link_down, @@ -538,10 +529,6 @@ static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable) return PTR_ERR(phydev); if (enable) { - err = genphy_config_init(phydev); - if (err < 0) - goto err_put_dev; - err = genphy_resume(phydev); if (err < 0) goto err_put_dev; @@ -571,7 +558,7 @@ static int dsa_port_fixed_link_register_of(struct dsa_port *dp) struct dsa_switch *ds = dp->ds; struct phy_device *phydev; int port = dp->index; - int mode; + phy_interface_t mode; int err; err = of_phy_register_fixed_link(dn); @@ -584,12 +571,11 @@ static int dsa_port_fixed_link_register_of(struct dsa_port *dp) phydev = of_phy_find_device(dn); - mode = of_get_phy_mode(dn); - if (mode < 0) + err = of_get_phy_mode(dn, &mode); + if (err) mode = PHY_INTERFACE_MODE_NA; phydev->interface = mode; - genphy_config_init(phydev); genphy_read_status(phydev); if (ds->ops->adjust_link) @@ -604,14 +590,16 @@ static int dsa_port_phylink_register(struct dsa_port *dp) { struct dsa_switch *ds = dp->ds; struct device_node *port_dn = dp->dn; - int mode, err; + phy_interface_t mode; + int err; - mode = of_get_phy_mode(port_dn); - if (mode < 0) + err = of_get_phy_mode(port_dn, &mode); + if (err) mode = PHY_INTERFACE_MODE_NA; dp->pl_config.dev = ds->dev; dp->pl_config.type = PHYLINK_DEV; + dp->pl_config.pcs_poll = ds->pcs_poll; dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode, &dsa_port_phylink_mac_ops); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 33f41178afcc..088c886e609e 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -22,8 +22,6 @@ #include "dsa_priv.h" -static bool dsa_slave_dev_check(const struct net_device *dev); - /* slave mii_bus handling ***************************************************/ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) { @@ -116,9 +114,6 @@ static int dsa_slave_close(struct net_device *dev) struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); - cancel_work_sync(&dp->xmit_work); - skb_queue_purge(&dp->xmit_queue); - phylink_stop(dp->pl); dsa_port_disable(dp); @@ -312,6 +307,39 @@ static int dsa_slave_port_attr_set(struct net_device *dev, return ret; } +static int dsa_slave_vlan_add(struct net_device *dev, + const struct switchdev_obj *obj, + struct switchdev_trans *trans) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct switchdev_obj_port_vlan vlan; + int err; + + if (obj->orig_dev != dev) + return -EOPNOTSUPP; + + if (dp->bridge_dev && !br_vlan_enabled(dp->bridge_dev)) + return 0; + + vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj); + + err = dsa_port_vlan_add(dp, &vlan, trans); + if (err) + return err; + + /* We need the dedicated CPU port to be a member of the VLAN as well. + * Even though drivers often handle CPU membership in special ways, + * it doesn't make sense to program a PVID, so clear this flag. + */ + vlan.flags &= ~BRIDGE_VLAN_INFO_PVID; + + err = dsa_port_vlan_add(dp->cpu_dp, &vlan, trans); + if (err) + return err; + + return 0; +} + static int dsa_slave_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct switchdev_trans *trans, @@ -339,10 +367,7 @@ static int dsa_slave_port_obj_add(struct net_device *dev, trans); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: - if (obj->orig_dev != dev) - return -EOPNOTSUPP; - err = dsa_port_vlan_add(dp, SWITCHDEV_OBJ_PORT_VLAN(obj), - trans); + err = dsa_slave_vlan_add(dev, obj, trans); break; default: err = -EOPNOTSUPP; @@ -352,6 +377,23 @@ static int dsa_slave_port_obj_add(struct net_device *dev, return err; } +static int dsa_slave_vlan_del(struct net_device *dev, + const struct switchdev_obj *obj) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + if (obj->orig_dev != dev) + return -EOPNOTSUPP; + + if (dp->bridge_dev && !br_vlan_enabled(dp->bridge_dev)) + return 0; + + /* Do not deprogram the CPU port as it may be shared with other user + * ports which can be members of this VLAN as well. + */ + return dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj)); +} + static int dsa_slave_port_obj_del(struct net_device *dev, const struct switchdev_obj *obj) { @@ -371,9 +413,7 @@ static int dsa_slave_port_obj_del(struct net_device *dev, err = dsa_port_mdb_del(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: - if (obj->orig_dev != dev) - return -EOPNOTSUPP; - err = dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj)); + err = dsa_slave_vlan_del(dev, obj); break; default: err = -EOPNOTSUPP; @@ -473,7 +513,6 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) s->tx_bytes += skb->len; u64_stats_update_end(&s->syncp); - DSA_SKB_CB(skb)->deferred_xmit = false; DSA_SKB_CB(skb)->clone = NULL; /* Identify PTP protocol packets, clone them, and pass them to the @@ -486,39 +525,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) */ nskb = p->xmit(skb, dev); if (!nskb) { - if (!DSA_SKB_CB(skb)->deferred_xmit) - kfree_skb(skb); + kfree_skb(skb); return NETDEV_TX_OK; } return dsa_enqueue_skb(nskb, dev); } -void *dsa_defer_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - - DSA_SKB_CB(skb)->deferred_xmit = true; - - skb_queue_tail(&dp->xmit_queue, skb); - schedule_work(&dp->xmit_work); - return NULL; -} -EXPORT_SYMBOL_GPL(dsa_defer_xmit); - -static void dsa_port_xmit_work(struct work_struct *work) -{ - struct dsa_port *dp = container_of(work, struct dsa_port, xmit_work); - struct dsa_switch *ds = dp->ds; - struct sk_buff *skb; - - if (unlikely(!ds->ops->port_deferred_xmit)) - return; - - while ((skb = skb_dequeue(&dp->xmit_queue)) != NULL) - ds->ops->port_deferred_xmit(ds, dp->index, skb); -} - /* ethtool operations *******************************************************/ static void dsa_slave_get_drvinfo(struct net_device *dev, @@ -744,6 +757,22 @@ static int dsa_slave_set_link_ksettings(struct net_device *dev, return phylink_ethtool_ksettings_set(dp->pl, cmd); } +static void dsa_slave_get_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + phylink_ethtool_get_pauseparam(dp->pl, pause); +} + +static int dsa_slave_set_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + return phylink_ethtool_set_pauseparam(dp->pl, pause); +} + #ifdef CONFIG_NET_POLL_CONTROLLER static int dsa_slave_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) @@ -990,12 +1019,16 @@ static int dsa_slave_setup_tc_block(struct net_device *dev, static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { - switch (type) { - case TC_SETUP_BLOCK: + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (type == TC_SETUP_BLOCK) return dsa_slave_setup_tc_block(dev, type_data); - default: + + if (!ds->ops->port_setup_tc) return -EOPNOTSUPP; - } + + return ds->ops->port_setup_tc(ds, dp->index, type, type_data); } static void dsa_slave_get_stats64(struct net_device *dev, @@ -1073,6 +1106,9 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto, * need to emulate the switchdev prepare + commit phase. */ if (dp->bridge_dev) { + if (!br_vlan_enabled(dp->bridge_dev)) + return 0; + /* br_vlan_get_info() returns -EINVAL or -ENOENT if the * device, respectively the VID is not found, returning * 0 means success, which is a failure for us here. @@ -1082,8 +1118,15 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto, return -EBUSY; } - /* This API only allows programming tagged, non-PVID VIDs */ - return dsa_port_vid_add(dp, vid, 0); + ret = dsa_port_vid_add(dp, vid, 0); + if (ret) + return ret; + + ret = dsa_port_vid_add(dp->cpu_dp, vid, 0); + if (ret) + return ret; + + return 0; } static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, @@ -1097,6 +1140,9 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, * need to emulate the switchdev prepare + commit phase. */ if (dp->bridge_dev) { + if (!br_vlan_enabled(dp->bridge_dev)) + return 0; + /* br_vlan_get_info() returns -EINVAL or -ENOENT if the * device, respectively the VID is not found, returning * 0 means success, which is a failure for us here. @@ -1106,11 +1152,10 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, return -EBUSY; } - ret = dsa_port_vid_del(dp, vid); - if (ret == -EOPNOTSUPP) - ret = 0; - - return ret; + /* Do not deprogram the CPU port as it may be shared with other user + * ports which can be members of this VLAN as well. + */ + return dsa_port_vid_del(dp, vid); } static const struct ethtool_ops dsa_slave_ethtool_ops = { @@ -1131,6 +1176,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_eee = dsa_slave_get_eee, .get_link_ksettings = dsa_slave_get_link_ksettings, .set_link_ksettings = dsa_slave_set_link_ksettings, + .get_pauseparam = dsa_slave_get_pauseparam, + .set_pauseparam = dsa_slave_set_pauseparam, .get_rxnfc = dsa_slave_get_rxnfc, .set_rxnfc = dsa_slave_set_rxnfc, .get_ts_info = dsa_slave_get_ts_info, @@ -1234,11 +1281,12 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) struct dsa_port *dp = dsa_slave_to_port(slave_dev); struct device_node *port_dn = dp->dn; struct dsa_switch *ds = dp->ds; + phy_interface_t mode; u32 phy_flags = 0; - int mode, ret; + int ret; - mode = of_get_phy_mode(port_dn); - if (mode < 0) + ret = of_get_phy_mode(port_dn, &mode); + if (ret) mode = PHY_INTERFACE_MODE_NA; dp->pl_config.dev = &slave_dev->dev; @@ -1280,15 +1328,6 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return ret; } -static struct lock_class_key dsa_slave_netdev_xmit_lock_key; -static void dsa_slave_set_lockdep_class_one(struct net_device *dev, - struct netdev_queue *txq, - void *_unused) -{ - lockdep_set_class(&txq->_xmit_lock, - &dsa_slave_netdev_xmit_lock_key); -} - int dsa_slave_suspend(struct net_device *slave_dev) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); @@ -1296,9 +1335,6 @@ int dsa_slave_suspend(struct net_device *slave_dev) if (!netif_running(slave_dev)) return 0; - cancel_work_sync(&dp->xmit_work); - skb_queue_purge(&dp->xmit_queue); - netif_device_detach(slave_dev); rtnl_lock(); @@ -1357,8 +1393,9 @@ int dsa_slave_create(struct dsa_port *port) if (slave_dev == NULL) return -ENOMEM; - slave_dev->features = master->vlan_features | NETIF_F_HW_TC | - NETIF_F_HW_VLAN_CTAG_FILTER; + slave_dev->features = master->vlan_features | NETIF_F_HW_TC; + if (ds->ops->port_vlan_add && ds->ops->port_vlan_del) + slave_dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; slave_dev->hw_features |= NETIF_F_HW_TC; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; if (!IS_ERR_OR_NULL(port->mac)) @@ -1371,9 +1408,6 @@ int dsa_slave_create(struct dsa_port *port) slave_dev->max_mtu = ETH_MAX_MTU; SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); - netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, - NULL); - SET_NETDEV_DEV(slave_dev, port->ds->dev); slave_dev->dev.of_node = port->dn; slave_dev->vlan_features = master->vlan_features; @@ -1386,8 +1420,6 @@ int dsa_slave_create(struct dsa_port *port) } p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); - INIT_WORK(&port->xmit_work, dsa_port_xmit_work); - skb_queue_head_init(&port->xmit_queue); p->xmit = cpu_dp->tag_ops->xmit; port->slave = slave_dev; @@ -1439,7 +1471,7 @@ void dsa_slave_destroy(struct net_device *slave_dev) free_netdev(slave_dev); } -static bool dsa_slave_dev_check(const struct net_device *dev) +bool dsa_slave_dev_check(const struct net_device *dev) { return dev->netdev_ops == &dsa_slave_netdev_ops; } diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 09d9286b27cc..df4abe897ed6 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -20,7 +20,7 @@ static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds, int i; for (i = 0; i < ds->num_ports; ++i) { - struct dsa_port *dp = &ds->ports[i]; + struct dsa_port *dp = dsa_to_port(ds, i); if (dp->ageing_time && dp->ageing_time < ageing_time) ageing_time = dp->ageing_time; @@ -98,7 +98,7 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, if (unset_vlan_filtering) { struct switchdev_trans trans = {0}; - err = dsa_port_vlan_filtering(&ds->ports[info->port], + err = dsa_port_vlan_filtering(dsa_to_port(ds, info->port), false, &trans); if (err && err != EOPNOTSUPP) return err; @@ -128,57 +128,51 @@ static int dsa_switch_fdb_del(struct dsa_switch *ds, return ds->ops->port_fdb_del(ds, port, info->addr, info->vid); } -static int -dsa_switch_mdb_prepare_bitmap(struct dsa_switch *ds, - const struct switchdev_obj_port_mdb *mdb, - const unsigned long *bitmap) +static bool dsa_switch_mdb_match(struct dsa_switch *ds, int port, + struct dsa_notifier_mdb_info *info) +{ + if (ds->index == info->sw_index && port == info->port) + return true; + + if (dsa_is_dsa_port(ds, port)) + return true; + + return false; +} + +static int dsa_switch_mdb_prepare(struct dsa_switch *ds, + struct dsa_notifier_mdb_info *info) { int port, err; if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) return -EOPNOTSUPP; - for_each_set_bit(port, bitmap, ds->num_ports) { - err = ds->ops->port_mdb_prepare(ds, port, mdb); - if (err) - return err; + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_mdb_match(ds, port, info)) { + err = ds->ops->port_mdb_prepare(ds, port, info->mdb); + if (err) + return err; + } } return 0; } -static void dsa_switch_mdb_add_bitmap(struct dsa_switch *ds, - const struct switchdev_obj_port_mdb *mdb, - const unsigned long *bitmap) -{ - int port; - - if (!ds->ops->port_mdb_add) - return; - - for_each_set_bit(port, bitmap, ds->num_ports) - ds->ops->port_mdb_add(ds, port, mdb); -} - static int dsa_switch_mdb_add(struct dsa_switch *ds, struct dsa_notifier_mdb_info *info) { - const struct switchdev_obj_port_mdb *mdb = info->mdb; - struct switchdev_trans *trans = info->trans; int port; - /* Build a mask of Multicast group members */ - bitmap_zero(ds->bitmap, ds->num_ports); - if (ds->index == info->sw_index) - set_bit(info->port, ds->bitmap); - for (port = 0; port < ds->num_ports; port++) - if (dsa_is_dsa_port(ds, port)) - set_bit(port, ds->bitmap); + if (switchdev_trans_ph_prepare(info->trans)) + return dsa_switch_mdb_prepare(ds, info); - if (switchdev_trans_ph_prepare(trans)) - return dsa_switch_mdb_prepare_bitmap(ds, mdb, ds->bitmap); + if (!ds->ops->port_mdb_add) + return 0; - dsa_switch_mdb_add_bitmap(ds, mdb, ds->bitmap); + for (port = 0; port < ds->num_ports; port++) + if (dsa_switch_mdb_match(ds, port, info)) + ds->ops->port_mdb_add(ds, port, info->mdb); return 0; } @@ -186,13 +180,11 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, static int dsa_switch_mdb_del(struct dsa_switch *ds, struct dsa_notifier_mdb_info *info) { - const struct switchdev_obj_port_mdb *mdb = info->mdb; - if (!ds->ops->port_mdb_del) return -EOPNOTSUPP; if (ds->index == info->sw_index) - return ds->ops->port_mdb_del(ds, info->port, mdb); + return ds->ops->port_mdb_del(ds, info->port, info->mdb); return 0; } @@ -234,59 +226,55 @@ static int dsa_port_vlan_check(struct dsa_switch *ds, int port, (void *)vlan); } -static int -dsa_switch_vlan_prepare_bitmap(struct dsa_switch *ds, - const struct switchdev_obj_port_vlan *vlan, - const unsigned long *bitmap) +static bool dsa_switch_vlan_match(struct dsa_switch *ds, int port, + struct dsa_notifier_vlan_info *info) +{ + if (ds->index == info->sw_index && port == info->port) + return true; + + if (dsa_is_dsa_port(ds, port)) + return true; + + return false; +} + +static int dsa_switch_vlan_prepare(struct dsa_switch *ds, + struct dsa_notifier_vlan_info *info) { int port, err; if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) return -EOPNOTSUPP; - for_each_set_bit(port, bitmap, ds->num_ports) { - err = dsa_port_vlan_check(ds, port, vlan); - if (err) - return err; + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_vlan_match(ds, port, info)) { + err = dsa_port_vlan_check(ds, port, info->vlan); + if (err) + return err; - err = ds->ops->port_vlan_prepare(ds, port, vlan); - if (err) - return err; + err = ds->ops->port_vlan_prepare(ds, port, info->vlan); + if (err) + return err; + } } return 0; } -static void -dsa_switch_vlan_add_bitmap(struct dsa_switch *ds, - const struct switchdev_obj_port_vlan *vlan, - const unsigned long *bitmap) -{ - int port; - - for_each_set_bit(port, bitmap, ds->num_ports) - ds->ops->port_vlan_add(ds, port, vlan); -} - static int dsa_switch_vlan_add(struct dsa_switch *ds, struct dsa_notifier_vlan_info *info) { - const struct switchdev_obj_port_vlan *vlan = info->vlan; - struct switchdev_trans *trans = info->trans; int port; - /* Build a mask of VLAN members */ - bitmap_zero(ds->bitmap, ds->num_ports); - if (ds->index == info->sw_index) - set_bit(info->port, ds->bitmap); - for (port = 0; port < ds->num_ports; port++) - if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) - set_bit(port, ds->bitmap); + if (switchdev_trans_ph_prepare(info->trans)) + return dsa_switch_vlan_prepare(ds, info); - if (switchdev_trans_ph_prepare(trans)) - return dsa_switch_vlan_prepare_bitmap(ds, vlan, ds->bitmap); + if (!ds->ops->port_vlan_add) + return 0; - dsa_switch_vlan_add_bitmap(ds, vlan, ds->bitmap); + for (port = 0; port < ds->num_ports; port++) + if (dsa_switch_vlan_match(ds, port, info)) + ds->ops->port_vlan_add(ds, port, info->vlan); return 0; } @@ -294,14 +282,15 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds, static int dsa_switch_vlan_del(struct dsa_switch *ds, struct dsa_notifier_vlan_info *info) { - const struct switchdev_obj_port_vlan *vlan = info->vlan; - if (!ds->ops->port_vlan_del) return -EOPNOTSUPP; if (ds->index == info->sw_index) - return ds->ops->port_vlan_del(ds, info->port, vlan); + return ds->ops->port_vlan_del(ds, info->port, info->vlan); + /* Do not deprogram the DSA links as they may be used as conduit + * for other VLAN members in the fabric. + */ return 0; } diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 6ebbd799c4eb..2fb6c26294b5 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -28,16 +28,17 @@ * * RSV - VID[9]: * To be used for further expansion of SWITCH_ID or for other purposes. + * Must be transmitted as zero and ignored on receive. * * SWITCH_ID - VID[8:6]: - * Index of switch within DSA tree. Must be between 0 and - * DSA_MAX_SWITCHES - 1. + * Index of switch within DSA tree. Must be between 0 and 7. * * RSV - VID[5:4]: * To be used for further expansion of PORT or for other purposes. + * Must be transmitted as zero and ignored on receive. * * PORT - VID[3:0]: - * Index of switch port. Must be between 0 and DSA_MAX_PORTS - 1. + * Index of switch port. Must be between 0 and 15. */ #define DSA_8021Q_DIR_SHIFT 10 @@ -91,6 +92,79 @@ int dsa_8021q_rx_source_port(u16 vid) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port); +static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) +{ + struct bridge_vlan_info vinfo; + struct net_device *slave; + u16 pvid; + int err; + + if (!dsa_is_user_port(ds, port)) + return 0; + + slave = dsa_to_port(ds, port)->slave; + + err = br_vlan_get_pvid(slave, &pvid); + if (!pvid || err < 0) + /* There is no pvid on the bridge for this port, which is + * perfectly valid. Nothing to restore, bye-bye! + */ + return 0; + + err = br_vlan_get_info(slave, pvid, &vinfo); + if (err < 0) { + dev_err(ds->dev, "Couldn't determine PVID attributes\n"); + return err; + } + + return dsa_port_vid_add(dsa_to_port(ds, port), pvid, vinfo.flags); +} + +/* If @enabled is true, installs @vid with @flags into the switch port's HW + * filter. + * If @enabled is false, deletes @vid (ignores @flags) from the port. Had the + * user explicitly configured this @vid through the bridge core, then the @vid + * is installed again, but this time with the flags from the bridge layer. + */ +static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid, + u16 flags, bool enabled) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct bridge_vlan_info vinfo; + int err; + + if (enabled) + return dsa_port_vid_add(dp, vid, flags); + + err = dsa_port_vid_del(dp, vid); + if (err < 0) + return err; + + /* Nothing to restore from the bridge for a non-user port. + * The CPU port VLANs are restored implicitly with the user ports, + * similar to how the bridge does in dsa_slave_vlan_add and + * dsa_slave_vlan_del. + */ + if (!dsa_is_user_port(ds, port)) + return 0; + + err = br_vlan_get_info(dp->slave, vid, &vinfo); + /* Couldn't determine bridge attributes for this vid, + * it means the bridge had not configured it. + */ + if (err < 0) + return 0; + + /* Restore the VID from the bridge */ + err = dsa_port_vid_add(dp, vid, vinfo.flags); + if (err < 0) + return err; + + vinfo.flags &= ~BRIDGE_VLAN_INFO_PVID; + + return dsa_port_vid_add(dp->cpu_dp, vid, vinfo.flags); +} + /* RX VLAN tagging (left) and TX VLAN tagging (right) setup shown for a single * front-panel switch port (here swp0). * @@ -146,8 +220,6 @@ EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port); int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) { int upstream = dsa_upstream_port(ds, port); - struct dsa_port *dp = &ds->ports[port]; - struct dsa_port *upstream_dp = &ds->ports[upstream]; u16 rx_vid = dsa_8021q_rx_vid(ds, port); u16 tx_vid = dsa_8021q_tx_vid(ds, port); int i, err; @@ -164,7 +236,6 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) * restrictions, so there are no concerns about leaking traffic. */ for (i = 0; i < ds->num_ports; i++) { - struct dsa_port *other_dp = &ds->ports[i]; u16 flags; if (i == upstream) @@ -177,10 +248,7 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) /* The RX VID is a regular VLAN on all others */ flags = BRIDGE_VLAN_INFO_UNTAGGED; - if (enabled) - err = dsa_port_vid_add(other_dp, rx_vid, flags); - else - err = dsa_port_vid_del(other_dp, rx_vid); + err = dsa_8021q_vid_apply(ds, i, rx_vid, flags, enabled); if (err) { dev_err(ds->dev, "Failed to apply RX VID %d to port %d: %d\n", rx_vid, port, err); @@ -191,10 +259,7 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) /* CPU port needs to see this port's RX VID * as tagged egress. */ - if (enabled) - err = dsa_port_vid_add(upstream_dp, rx_vid, 0); - else - err = dsa_port_vid_del(upstream_dp, rx_vid); + err = dsa_8021q_vid_apply(ds, upstream, rx_vid, 0, enabled); if (err) { dev_err(ds->dev, "Failed to apply RX VID %d to port %d: %d\n", rx_vid, port, err); @@ -202,26 +267,24 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) } /* Finally apply the TX VID on this port and on the CPU port */ - if (enabled) - err = dsa_port_vid_add(dp, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED); - else - err = dsa_port_vid_del(dp, tx_vid); + err = dsa_8021q_vid_apply(ds, port, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED, + enabled); if (err) { dev_err(ds->dev, "Failed to apply TX VID %d on port %d: %d\n", tx_vid, port, err); return err; } - if (enabled) - err = dsa_port_vid_add(upstream_dp, tx_vid, 0); - else - err = dsa_port_vid_del(upstream_dp, tx_vid); + err = dsa_8021q_vid_apply(ds, upstream, tx_vid, 0, enabled); if (err) { dev_err(ds->dev, "Failed to apply TX VID %d on port %d: %d\n", tx_vid, upstream, err); return err; } - return 0; + if (!enabled) + err = dsa_8021q_restore_pvid(ds, port); + + return err; } EXPORT_SYMBOL_GPL(dsa_port_setup_8021q_tagging); @@ -278,13 +341,4 @@ struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb) } EXPORT_SYMBOL_GPL(dsa_8021q_remove_header); -static const struct dsa_device_ops dsa_8021q_netdev_ops = { - .name = "8021q", - .proto = DSA_TAG_PROTO_8021Q, - .overhead = VLAN_HLEN, -}; - MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_8021Q); - -module_dsa_tag_driver(dsa_8021q_netdev_ops); diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c new file mode 100644 index 000000000000..55b00694cdba --- /dev/null +++ b/net/dsa/tag_ar9331.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> + */ + + +#include <linux/bitfield.h> +#include <linux/etherdevice.h> + +#include "dsa_priv.h" + +#define AR9331_HDR_LEN 2 +#define AR9331_HDR_VERSION 1 + +#define AR9331_HDR_VERSION_MASK GENMASK(15, 14) +#define AR9331_HDR_PRIORITY_MASK GENMASK(13, 12) +#define AR9331_HDR_TYPE_MASK GENMASK(10, 8) +#define AR9331_HDR_BROADCAST BIT(7) +#define AR9331_HDR_FROM_CPU BIT(6) +/* AR9331_HDR_RESERVED - not used or may be version field. + * According to the AR8216 doc it should 0b10. On AR9331 it is 0b11 on RX path + * and should be set to 0b11 to make it work. + */ +#define AR9331_HDR_RESERVED_MASK GENMASK(5, 4) +#define AR9331_HDR_PORT_NUM_MASK GENMASK(3, 0) + +static struct sk_buff *ar9331_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + __le16 *phdr; + u16 hdr; + + if (skb_cow_head(skb, AR9331_HDR_LEN) < 0) + return NULL; + + phdr = skb_push(skb, AR9331_HDR_LEN); + + hdr = FIELD_PREP(AR9331_HDR_VERSION_MASK, AR9331_HDR_VERSION); + hdr |= AR9331_HDR_FROM_CPU | dp->index; + /* 0b10 for AR8216 and 0b11 for AR9331 */ + hdr |= AR9331_HDR_RESERVED_MASK; + + phdr[0] = cpu_to_le16(hdr); + + return skb; +} + +static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb, + struct net_device *ndev, + struct packet_type *pt) +{ + u8 ver, port; + u16 hdr; + + if (unlikely(!pskb_may_pull(skb, AR9331_HDR_LEN))) + return NULL; + + hdr = le16_to_cpu(*(__le16 *)skb_mac_header(skb)); + + ver = FIELD_GET(AR9331_HDR_VERSION_MASK, hdr); + if (unlikely(ver != AR9331_HDR_VERSION)) { + netdev_warn_once(ndev, "%s:%i wrong header version 0x%2x\n", + __func__, __LINE__, hdr); + return NULL; + } + + if (unlikely(hdr & AR9331_HDR_FROM_CPU)) { + netdev_warn_once(ndev, "%s:%i packet should not be from cpu 0x%2x\n", + __func__, __LINE__, hdr); + return NULL; + } + + skb_pull_rcsum(skb, AR9331_HDR_LEN); + + /* Get source port information */ + port = FIELD_GET(AR9331_HDR_PORT_NUM_MASK, hdr); + + skb->dev = dsa_master_find_slave(ndev, 0, port); + if (!skb->dev) + return NULL; + + return skb; +} + +static const struct dsa_device_ops ar9331_netdev_ops = { + .name = "ar9331", + .proto = DSA_TAG_PROTO_AR9331, + .xmit = ar9331_tag_xmit, + .rcv = ar9331_tag_rcv, + .overhead = AR9331_HDR_LEN, +}; + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_AR9331); +module_dsa_tag_driver(ar9331_netdev_ops); diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c index b678160bbd66..408d4af390a0 100644 --- a/net/dsa/tag_gswip.c +++ b/net/dsa/tag_gswip.c @@ -104,7 +104,7 @@ static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb, } static const struct dsa_device_ops gswip_netdev_ops = { - .name = "gwsip", + .name = "gswip", .proto = DSA_TAG_PROTO_GSWIP, .xmit = gswip_tag_xmit, .rcv = gswip_tag_rcv, diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index b4872b87d4a6..90d055c4df9e 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -70,6 +70,65 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, } /* + * For Ingress (Host -> KSZ8795), 1 byte is added before FCS. + * --------------------------------------------------------------------------- + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag(1byte)|FCS(4bytes) + * --------------------------------------------------------------------------- + * tag : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5) + * + * For Egress (KSZ8795 -> Host), 1 byte is added before FCS. + * --------------------------------------------------------------------------- + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes) + * --------------------------------------------------------------------------- + * tag0 : zero-based value represents port + * (eg, 0x00=port1, 0x02=port3, 0x06=port7) + */ + +#define KSZ8795_TAIL_TAG_OVERRIDE BIT(6) +#define KSZ8795_TAIL_TAG_LOOKUP BIT(7) + +static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct sk_buff *nskb; + u8 *tag; + u8 *addr; + + nskb = ksz_common_xmit(skb, dev, KSZ_INGRESS_TAG_LEN); + if (!nskb) + return NULL; + + /* Tag encoding */ + tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN); + addr = skb_mac_header(nskb); + + *tag = 1 << dp->index; + if (is_link_local_ether_addr(addr)) + *tag |= KSZ8795_TAIL_TAG_OVERRIDE; + + return nskb; +} + +static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt) +{ + u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; + + return ksz_common_rcv(skb, dev, tag[0] & 7, KSZ_EGRESS_TAG_LEN); +} + +static const struct dsa_device_ops ksz8795_netdev_ops = { + .name = "ksz8795", + .proto = DSA_TAG_PROTO_KSZ8795, + .xmit = ksz8795_xmit, + .rcv = ksz8795_rcv, + .overhead = KSZ_INGRESS_TAG_LEN, +}; + +DSA_TAG_DRIVER(ksz8795_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795); + +/* * For Ingress (Host -> KSZ9477), 2 bytes are added before FCS. * --------------------------------------------------------------------------- * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes) @@ -183,6 +242,7 @@ DSA_TAG_DRIVER(ksz9893_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893); static struct dsa_tag_driver *dsa_tag_driver_array[] = { + &DSA_TAG_DRIVER_NAME(ksz8795_netdev_ops), &DSA_TAG_DRIVER_NAME(ksz9477_netdev_ops), &DSA_TAG_DRIVER_NAME(ksz9893_netdev_ops), }; diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c new file mode 100644 index 000000000000..8e3e7283d430 --- /dev/null +++ b/net/dsa/tag_ocelot.c @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2019 NXP Semiconductors + */ +#include <soc/mscc/ocelot.h> +#include <linux/packing.h> +#include "dsa_priv.h" + +/* The CPU injection header and the CPU extraction header can have 3 types of + * prefixes: long, short and no prefix. The format of the header itself is the + * same in all 3 cases. + * + * Extraction with long prefix: + * + * +-------------------+-------------------+------+------+------------+-------+ + * | ff:ff:ff:ff:ff:ff | ff:ff:ff:ff:ff:ff | 8880 | 000a | extraction | frame | + * | | | | | header | | + * +-------------------+-------------------+------+------+------------+-------+ + * 48 bits 48 bits 16 bits 16 bits 128 bits + * + * Extraction with short prefix: + * + * +------+------+------------+-------+ + * | 8880 | 000a | extraction | frame | + * | | | header | | + * +------+------+------------+-------+ + * 16 bits 16 bits 128 bits + * + * Extraction with no prefix: + * + * +------------+-------+ + * | extraction | frame | + * | header | | + * +------------+-------+ + * 128 bits + * + * + * Injection with long prefix: + * + * +-------------------+-------------------+------+------+------------+-------+ + * | any dmac | any smac | 8880 | 000a | injection | frame | + * | | | | | header | | + * +-------------------+-------------------+------+------+------------+-------+ + * 48 bits 48 bits 16 bits 16 bits 128 bits + * + * Injection with short prefix: + * + * +------+------+------------+-------+ + * | 8880 | 000a | injection | frame | + * | | | header | | + * +------+------+------------+-------+ + * 16 bits 16 bits 128 bits + * + * Injection with no prefix: + * + * +------------+-------+ + * | injection | frame | + * | header | | + * +------------+-------+ + * 128 bits + * + * The injection header looks like this (network byte order, bit 127 + * is part of lowest address byte in memory, bit 0 is part of highest + * address byte): + * + * +------+------+------+------+------+------+------+------+ + * 127:120 |BYPASS| MASQ | MASQ_PORT |REW_OP|REW_OP| + * +------+------+------+------+------+------+------+------+ + * 119:112 | REW_OP | + * +------+------+------+------+------+------+------+------+ + * 111:104 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 103: 96 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 95: 88 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 87: 80 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 79: 72 | RSV | + * +------+------+------+------+------+------+------+------+ + * 71: 64 | RSV | DEST | + * +------+------+------+------+------+------+------+------+ + * 63: 56 | DEST | + * +------+------+------+------+------+------+------+------+ + * 55: 48 | RSV | + * +------+------+------+------+------+------+------+------+ + * 47: 40 | RSV | SRC_PORT | RSV |TFRM_TIMER| + * +------+------+------+------+------+------+------+------+ + * 39: 32 | TFRM_TIMER | RSV | + * +------+------+------+------+------+------+------+------+ + * 31: 24 | RSV | DP | POP_CNT | CPUQ | + * +------+------+------+------+------+------+------+------+ + * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE| + * +------+------+------+------+------+------+------+------+ + * 15: 8 | PCP | DEI | VID | + * +------+------+------+------+------+------+------+------+ + * 7: 0 | VID | + * +------+------+------+------+------+------+------+------+ + * + * And the extraction header looks like this: + * + * +------+------+------+------+------+------+------+------+ + * 127:120 | RSV | REW_OP | + * +------+------+------+------+------+------+------+------+ + * 119:112 | REW_OP | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 111:104 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 103: 96 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 95: 88 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 87: 80 | REW_VAL | LLEN | + * +------+------+------+------+------+------+------+------+ + * 79: 72 | LLEN | WLEN | + * +------+------+------+------+------+------+------+------+ + * 71: 64 | WLEN | RSV | + * +------+------+------+------+------+------+------+------+ + * 63: 56 | RSV | + * +------+------+------+------+------+------+------+------+ + * 55: 48 | RSV | + * +------+------+------+------+------+------+------+------+ + * 47: 40 | RSV | SRC_PORT | ACL_ID | + * +------+------+------+------+------+------+------+------+ + * 39: 32 | ACL_ID | RSV | SFLOW_ID | + * +------+------+------+------+------+------+------+------+ + * 31: 24 |ACL_HIT| DP | LRN_FLAGS | CPUQ | + * +------+------+------+------+------+------+------+------+ + * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE| + * +------+------+------+------+------+------+------+------+ + * 15: 8 | PCP | DEI | VID | + * +------+------+------+------+------+------+------+------+ + * 7: 0 | VID | + * +------+------+------+------+------+------+------+------+ + */ + +static struct sk_buff *ocelot_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct dsa_port *dp = dsa_slave_to_port(netdev); + u64 bypass, dest, src, qos_class, rew_op; + struct dsa_switch *ds = dp->ds; + int port = dp->index; + struct ocelot *ocelot = ds->priv; + struct ocelot_port *ocelot_port = ocelot->ports[port]; + u8 *injection; + + if (unlikely(skb_cow_head(skb, OCELOT_TAG_LEN) < 0)) { + netdev_err(netdev, "Cannot make room for tag.\n"); + return NULL; + } + + injection = skb_push(skb, OCELOT_TAG_LEN); + + memset(injection, 0, OCELOT_TAG_LEN); + + src = dsa_upstream_port(ds, port); + dest = BIT(port); + bypass = true; + qos_class = skb->priority; + + packing(injection, &bypass, 127, 127, OCELOT_TAG_LEN, PACK, 0); + packing(injection, &dest, 68, 56, OCELOT_TAG_LEN, PACK, 0); + packing(injection, &src, 46, 43, OCELOT_TAG_LEN, PACK, 0); + packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0); + + if (ocelot->ptp && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { + rew_op = ocelot_port->ptp_cmd; + if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { + rew_op |= (ocelot_port->ts_id % 4) << 3; + ocelot_port->ts_id++; + } + + packing(injection, &rew_op, 125, 117, OCELOT_TAG_LEN, PACK, 0); + } + + return skb; +} + +static struct sk_buff *ocelot_rcv(struct sk_buff *skb, + struct net_device *netdev, + struct packet_type *pt) +{ + u64 src_port, qos_class; + u8 *start = skb->data; + u8 *extraction; + + /* Revert skb->data by the amount consumed by the DSA master, + * so it points to the beginning of the frame. + */ + skb_push(skb, ETH_HLEN); + /* We don't care about the long prefix, it is just for easy entrance + * into the DSA master's RX filter. Discard it now by moving it into + * the headroom. + */ + skb_pull(skb, OCELOT_LONG_PREFIX_LEN); + /* And skb->data now points to the extraction frame header. + * Keep a pointer to it. + */ + extraction = skb->data; + /* Now the EFH is part of the headroom as well */ + skb_pull(skb, OCELOT_TAG_LEN); + /* Reset the pointer to the real MAC header */ + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + /* And move skb->data to the correct location again */ + skb_pull(skb, ETH_HLEN); + + /* Remove from inet csum the extraction header */ + skb_postpull_rcsum(skb, start, OCELOT_LONG_PREFIX_LEN + OCELOT_TAG_LEN); + + packing(extraction, &src_port, 46, 43, OCELOT_TAG_LEN, UNPACK, 0); + packing(extraction, &qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0); + + skb->dev = dsa_master_find_slave(netdev, 0, src_port); + if (!skb->dev) + /* The switch will reflect back some frames sent through + * sockets opened on the bare DSA master. These will come back + * with src_port equal to the index of the CPU port, for which + * there is no slave registered. So don't print any error + * message here (ignore and drop those frames). + */ + return NULL; + + skb->offload_fwd_mark = 1; + skb->priority = qos_class; + + return skb; +} + +static struct dsa_device_ops ocelot_netdev_ops = { + .name = "ocelot", + .proto = DSA_TAG_PROTO_OCELOT, + .xmit = ocelot_xmit, + .rcv = ocelot_rcv, + .overhead = OCELOT_TAG_LEN + OCELOT_LONG_PREFIX_LEN, +}; + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT); + +module_dsa_tag_driver(ocelot_netdev_ops); diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index c95885215525..70db7c909f74 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -33,10 +33,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) struct dsa_port *dp = dsa_slave_to_port(dev); u16 *phdr, hdr; - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; - - if (skb_cow_head(skb, 0) < 0) + if (skb_cow_head(skb, QCA_HDR_LEN) < 0) return NULL; skb_push(skb, QCA_HDR_LEN); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 47ee88163a9d..5366ea430349 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -83,20 +83,33 @@ static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev) return false; } +/* Calls sja1105_port_deferred_xmit in sja1105_main.c */ +static struct sk_buff *sja1105_defer_xmit(struct sja1105_port *sp, + struct sk_buff *skb) +{ + /* Increase refcount so the kfree_skb in dsa_slave_xmit + * won't really free the packet. + */ + skb_queue_tail(&sp->xmit_queue, skb_get(skb)); + kthread_queue_work(sp->xmit_worker, &sp->xmit_work); + + return NULL; +} + static struct sk_buff *sja1105_xmit(struct sk_buff *skb, struct net_device *netdev) { struct dsa_port *dp = dsa_slave_to_port(netdev); - struct dsa_switch *ds = dp->ds; - u16 tx_vid = dsa_8021q_tx_vid(ds, dp->index); - u8 pcp = skb->priority; + u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); + u16 queue_mapping = skb_get_queue_mapping(skb); + u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); /* Transmitting management traffic does not rely upon switch tagging, * but instead SPI-installed management routes. Part 2 of this * is the .port_deferred_xmit driver callback. */ if (unlikely(sja1105_is_link_local(skb))) - return dsa_defer_xmit(skb, netdev); + return sja1105_defer_xmit(dp->priv, skb); /* If we are under a vlan_filtering bridge, IP termination on * switch ports based on 802.1Q tags is simply too brittle to @@ -155,7 +168,11 @@ static struct sk_buff /* Step 1: A timestampable frame was received. * Buffer it until we get its meta frame. */ - if (is_link_local && sp->data->hwts_rx_en) { + if (is_link_local) { + if (!test_bit(SJA1105_HWTS_RX_EN, &sp->data->state)) + /* Do normal processing. */ + return skb; + spin_lock(&sp->data->meta_lock); /* Was this a link-local frame instead of the meta * that we were expecting? @@ -186,6 +203,12 @@ static struct sk_buff } else if (is_meta) { struct sk_buff *stampable_skb; + /* Drop the meta frame if we're not in the right state + * to process it. + */ + if (!test_bit(SJA1105_HWTS_RX_EN, &sp->data->state)) + return NULL; + spin_lock(&sp->data->meta_lock); stampable_skb = sp->data->stampable_skb; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 17374afee28f..c8b903302ff2 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -244,7 +244,12 @@ int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 eth->h_proto = type; memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); memcpy(eth->h_dest, neigh->ha, ETH_ALEN); - hh->hh_len = ETH_HLEN; + + /* Pairs with READ_ONCE() in neigh_resolve_output(), + * neigh_hh_output() and neigh_update_hhs(). + */ + smp_store_release(&hh->hh_len, ETH_HLEN); + return 0; } EXPORT_SYMBOL(eth_header_cache); @@ -330,22 +335,6 @@ int eth_mac_addr(struct net_device *dev, void *p) } EXPORT_SYMBOL(eth_mac_addr); -/** - * eth_change_mtu - set new MTU size - * @dev: network device - * @new_mtu: new Maximum Transfer Unit - * - * Allow changing MTU size. Needs to be overridden for devices - * supporting jumbo frames. - */ -int eth_change_mtu(struct net_device *dev, int new_mtu) -{ - netdev_warn(dev, "%s is deprecated\n", __func__); - dev->mtu = new_mtu; - return 0; -} -EXPORT_SYMBOL(eth_change_mtu); - int eth_validate_addr(struct net_device *dev) { if (!is_valid_ether_addr(dev->dev_addr)) diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile new file mode 100644 index 000000000000..424545a4aaec --- /dev/null +++ b/net/ethtool/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-y += ioctl.o common.o + +obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o + +ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ + linkstate.o debug.o wol.o diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c new file mode 100644 index 000000000000..fce45dac4205 --- /dev/null +++ b/net/ethtool/bitset.c @@ -0,0 +1,735 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/ethtool_netlink.h> +#include <linux/bitmap.h> +#include "netlink.h" +#include "bitset.h" + +/* Some bitmaps are internally represented as an array of unsigned long, some + * as an array of u32 (some even as single u32 for now). To avoid the need of + * wrappers on caller side, we provide two set of functions: those with "32" + * suffix in their names expect u32 based bitmaps, those without it expect + * unsigned long bitmaps. + */ + +static u32 ethnl_lower_bits(unsigned int n) +{ + return ~(u32)0 >> (32 - n % 32); +} + +static u32 ethnl_upper_bits(unsigned int n) +{ + return ~(u32)0 << (n % 32); +} + +/** + * ethnl_bitmap32_clear() - Clear u32 based bitmap + * @dst: bitmap to clear + * @start: beginning of the interval + * @end: end of the interval + * @mod: set if bitmap was modified + * + * Clear @nbits bits of a bitmap with indices @start <= i < @end + */ +static void ethnl_bitmap32_clear(u32 *dst, unsigned int start, unsigned int end, + bool *mod) +{ + unsigned int start_word = start / 32; + unsigned int end_word = end / 32; + unsigned int i; + u32 mask; + + if (end <= start) + return; + + if (start % 32) { + mask = ethnl_upper_bits(start); + if (end_word == start_word) { + mask &= ethnl_lower_bits(end); + if (dst[start_word] & mask) { + dst[start_word] &= ~mask; + *mod = true; + } + return; + } + if (dst[start_word] & mask) { + dst[start_word] &= ~mask; + *mod = true; + } + start_word++; + } + + for (i = start_word; i < end_word; i++) { + if (dst[i]) { + dst[i] = 0; + *mod = true; + } + } + if (end % 32) { + mask = ethnl_lower_bits(end); + if (dst[end_word] & mask) { + dst[end_word] &= ~mask; + *mod = true; + } + } +} + +/** + * ethnl_bitmap32_not_zero() - Check if any bit is set in an interval + * @map: bitmap to test + * @start: beginning of the interval + * @end: end of the interval + * + * Return: true if there is non-zero bit with index @start <= i < @end, + * false if the whole interval is zero + */ +static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start, + unsigned int end) +{ + unsigned int start_word = start / 32; + unsigned int end_word = end / 32; + u32 mask; + + if (end <= start) + return true; + + if (start % 32) { + mask = ethnl_upper_bits(start); + if (end_word == start_word) { + mask &= ethnl_lower_bits(end); + return map[start_word] & mask; + } + if (map[start_word] & mask) + return true; + start_word++; + } + + if (!memchr_inv(map + start_word, '\0', + (end_word - start_word) * sizeof(u32))) + return true; + if (end % 32 == 0) + return true; + return map[end_word] & ethnl_lower_bits(end); +} + +/** + * ethnl_bitmap32_update() - Modify u32 based bitmap according to value/mask + * pair + * @dst: bitmap to update + * @nbits: bit size of the bitmap + * @value: values to set + * @mask: mask of bits to set + * @mod: set to true if bitmap is modified, preserve if not + * + * Set bits in @dst bitmap which are set in @mask to values from @value, leave + * the rest untouched. If destination bitmap was modified, set @mod to true, + * leave as it is if not. + */ +static void ethnl_bitmap32_update(u32 *dst, unsigned int nbits, + const u32 *value, const u32 *mask, bool *mod) +{ + while (nbits > 0) { + u32 real_mask = mask ? *mask : ~(u32)0; + u32 new_value; + + if (nbits < 32) + real_mask &= ethnl_lower_bits(nbits); + new_value = (*dst & ~real_mask) | (*value & real_mask); + if (new_value != *dst) { + *dst = new_value; + *mod = true; + } + + if (nbits <= 32) + break; + dst++; + nbits -= 32; + value++; + if (mask) + mask++; + } +} + +static bool ethnl_bitmap32_test_bit(const u32 *map, unsigned int index) +{ + return map[index / 32] & (1U << (index % 32)); +} + +/** + * ethnl_bitset32_size() - Calculate size of bitset nested attribute + * @val: value bitmap (u32 based) + * @mask: mask bitmap (u32 based, optional) + * @nbits: bit length of the bitset + * @names: array of bit names (optional) + * @compact: assume compact format for output + * + * Estimate length of netlink attribute composed by a later call to + * ethnl_put_bitset32() call with the same arguments. + * + * Return: negative error code or attribute length estimate + */ +int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits, + ethnl_string_array_t names, bool compact) +{ + unsigned int len = 0; + + /* list flag */ + if (!mask) + len += nla_total_size(sizeof(u32)); + /* size */ + len += nla_total_size(sizeof(u32)); + + if (compact) { + unsigned int nwords = DIV_ROUND_UP(nbits, 32); + + /* value, mask */ + len += (mask ? 2 : 1) * nla_total_size(nwords * sizeof(u32)); + } else { + unsigned int bits_len = 0; + unsigned int bit_len, i; + + for (i = 0; i < nbits; i++) { + const char *name = names ? names[i] : NULL; + + if (!ethnl_bitmap32_test_bit(mask ?: val, i)) + continue; + /* index */ + bit_len = nla_total_size(sizeof(u32)); + /* name */ + if (name) + bit_len += ethnl_strz_size(name); + /* value */ + if (mask && ethnl_bitmap32_test_bit(val, i)) + bit_len += nla_total_size(0); + + /* bit nest */ + bits_len += nla_total_size(bit_len); + } + /* bits nest */ + len += nla_total_size(bits_len); + } + + /* outermost nest */ + return nla_total_size(len); +} + +/** + * ethnl_put_bitset32() - Put a bitset nest into a message + * @skb: skb with the message + * @attrtype: attribute type for the bitset nest + * @val: value bitmap (u32 based) + * @mask: mask bitmap (u32 based, optional) + * @nbits: bit length of the bitset + * @names: array of bit names (optional) + * @compact: use compact format for the output + * + * Compose a nested attribute representing a bitset. If @mask is null, simple + * bitmap (bit list) is created, if @mask is provided, represent a value/mask + * pair. Bit names are only used in verbose mode and when provided by calller. + * + * Return: 0 on success, negative error value on error + */ +int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val, + const u32 *mask, unsigned int nbits, + ethnl_string_array_t names, bool compact) +{ + struct nlattr *nest; + struct nlattr *attr; + + nest = nla_nest_start(skb, attrtype); + if (!nest) + return -EMSGSIZE; + + if (!mask && nla_put_flag(skb, ETHTOOL_A_BITSET_NOMASK)) + goto nla_put_failure; + if (nla_put_u32(skb, ETHTOOL_A_BITSET_SIZE, nbits)) + goto nla_put_failure; + if (compact) { + unsigned int nwords = DIV_ROUND_UP(nbits, 32); + unsigned int nbytes = nwords * sizeof(u32); + u32 *dst; + + attr = nla_reserve(skb, ETHTOOL_A_BITSET_VALUE, nbytes); + if (!attr) + goto nla_put_failure; + dst = nla_data(attr); + memcpy(dst, val, nbytes); + if (nbits % 32) + dst[nwords - 1] &= ethnl_lower_bits(nbits); + + if (mask) { + attr = nla_reserve(skb, ETHTOOL_A_BITSET_MASK, nbytes); + if (!attr) + goto nla_put_failure; + dst = nla_data(attr); + memcpy(dst, mask, nbytes); + if (nbits % 32) + dst[nwords - 1] &= ethnl_lower_bits(nbits); + } + } else { + struct nlattr *bits; + unsigned int i; + + bits = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS); + if (!bits) + goto nla_put_failure; + for (i = 0; i < nbits; i++) { + const char *name = names ? names[i] : NULL; + + if (!ethnl_bitmap32_test_bit(mask ?: val, i)) + continue; + attr = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS_BIT); + if (!attr) + goto nla_put_failure; + if (nla_put_u32(skb, ETHTOOL_A_BITSET_BIT_INDEX, i)) + goto nla_put_failure; + if (name && + ethnl_put_strz(skb, ETHTOOL_A_BITSET_BIT_NAME, name)) + goto nla_put_failure; + if (mask && ethnl_bitmap32_test_bit(val, i) && + nla_put_flag(skb, ETHTOOL_A_BITSET_BIT_VALUE)) + goto nla_put_failure; + nla_nest_end(skb, attr); + } + nla_nest_end(skb, bits); + } + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static const struct nla_policy bitset_policy[ETHTOOL_A_BITSET_MAX + 1] = { + [ETHTOOL_A_BITSET_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_BITSET_NOMASK] = { .type = NLA_FLAG }, + [ETHTOOL_A_BITSET_SIZE] = { .type = NLA_U32 }, + [ETHTOOL_A_BITSET_BITS] = { .type = NLA_NESTED }, + [ETHTOOL_A_BITSET_VALUE] = { .type = NLA_BINARY }, + [ETHTOOL_A_BITSET_MASK] = { .type = NLA_BINARY }, +}; + +static const struct nla_policy bit_policy[ETHTOOL_A_BITSET_BIT_MAX + 1] = { + [ETHTOOL_A_BITSET_BIT_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_BITSET_BIT_INDEX] = { .type = NLA_U32 }, + [ETHTOOL_A_BITSET_BIT_NAME] = { .type = NLA_NUL_STRING }, + [ETHTOOL_A_BITSET_BIT_VALUE] = { .type = NLA_FLAG }, +}; + +/** + * ethnl_bitset_is_compact() - check if bitset attribute represents a compact + * bitset + * @bitset: nested attribute representing a bitset + * @compact: pointer for return value + * + * Return: 0 on success, negative error code on failure + */ +int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact) +{ + struct nlattr *tb[ETHTOOL_A_BITSET_MAX + 1]; + int ret; + + ret = nla_parse_nested(tb, ETHTOOL_A_BITSET_MAX, bitset, + bitset_policy, NULL); + if (ret < 0) + return ret; + + if (tb[ETHTOOL_A_BITSET_BITS]) { + if (tb[ETHTOOL_A_BITSET_VALUE] || tb[ETHTOOL_A_BITSET_MASK]) + return -EINVAL; + *compact = false; + return 0; + } + if (!tb[ETHTOOL_A_BITSET_SIZE] || !tb[ETHTOOL_A_BITSET_VALUE]) + return -EINVAL; + + *compact = true; + return 0; +} + +/** + * ethnl_name_to_idx() - look up string index for a name + * @names: array of ETH_GSTRING_LEN sized strings + * @n_names: number of strings in the array + * @name: name to look up + * + * Return: index of the string if found, -ENOENT if not found + */ +static int ethnl_name_to_idx(ethnl_string_array_t names, unsigned int n_names, + const char *name) +{ + unsigned int i; + + if (!names) + return -ENOENT; + + for (i = 0; i < n_names; i++) { + /* names[i] may not be null terminated */ + if (!strncmp(names[i], name, ETH_GSTRING_LEN) && + strlen(name) <= ETH_GSTRING_LEN) + return i; + } + + return -ENOENT; +} + +static int ethnl_parse_bit(unsigned int *index, bool *val, unsigned int nbits, + const struct nlattr *bit_attr, bool no_mask, + ethnl_string_array_t names, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[ETHTOOL_A_BITSET_BIT_MAX + 1]; + int ret, idx; + + ret = nla_parse_nested(tb, ETHTOOL_A_BITSET_BIT_MAX, bit_attr, + bit_policy, extack); + if (ret < 0) + return ret; + + if (tb[ETHTOOL_A_BITSET_BIT_INDEX]) { + const char *name; + + idx = nla_get_u32(tb[ETHTOOL_A_BITSET_BIT_INDEX]); + if (idx >= nbits) { + NL_SET_ERR_MSG_ATTR(extack, + tb[ETHTOOL_A_BITSET_BIT_INDEX], + "bit index too high"); + return -EOPNOTSUPP; + } + name = names ? names[idx] : NULL; + if (tb[ETHTOOL_A_BITSET_BIT_NAME] && name && + strncmp(nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME]), name, + nla_len(tb[ETHTOOL_A_BITSET_BIT_NAME]))) { + NL_SET_ERR_MSG_ATTR(extack, bit_attr, + "bit index and name mismatch"); + return -EINVAL; + } + } else if (tb[ETHTOOL_A_BITSET_BIT_NAME]) { + idx = ethnl_name_to_idx(names, nbits, + nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME])); + if (idx < 0) { + NL_SET_ERR_MSG_ATTR(extack, + tb[ETHTOOL_A_BITSET_BIT_NAME], + "bit name not found"); + return -EOPNOTSUPP; + } + } else { + NL_SET_ERR_MSG_ATTR(extack, bit_attr, + "neither bit index nor name specified"); + return -EINVAL; + } + + *index = idx; + *val = no_mask || tb[ETHTOOL_A_BITSET_BIT_VALUE]; + return 0; +} + +static int +ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, + const struct nlattr *attr, struct nlattr **tb, + ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod) +{ + struct nlattr *bit_attr; + bool no_mask; + int rem; + int ret; + + if (tb[ETHTOOL_A_BITSET_VALUE]) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], + "value only allowed in compact bitset"); + return -EINVAL; + } + if (tb[ETHTOOL_A_BITSET_MASK]) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], + "mask only allowed in compact bitset"); + return -EINVAL; + } + no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; + + nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { + bool old_val, new_val; + unsigned int idx; + + if (nla_type(bit_attr) != ETHTOOL_A_BITSET_BITS_BIT) { + NL_SET_ERR_MSG_ATTR(extack, bit_attr, + "only ETHTOOL_A_BITSET_BITS_BIT allowed in ETHTOOL_A_BITSET_BITS"); + return -EINVAL; + } + ret = ethnl_parse_bit(&idx, &new_val, nbits, bit_attr, no_mask, + names, extack); + if (ret < 0) + return ret; + old_val = bitmap[idx / 32] & ((u32)1 << (idx % 32)); + if (new_val != old_val) { + if (new_val) + bitmap[idx / 32] |= ((u32)1 << (idx % 32)); + else + bitmap[idx / 32] &= ~((u32)1 << (idx % 32)); + *mod = true; + } + } + + return 0; +} + +static int ethnl_compact_sanity_checks(unsigned int nbits, + const struct nlattr *nest, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + bool no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; + unsigned int attr_nbits, attr_nwords; + const struct nlattr *test_attr; + + if (no_mask && tb[ETHTOOL_A_BITSET_MASK]) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], + "mask not allowed in list bitset"); + return -EINVAL; + } + if (!tb[ETHTOOL_A_BITSET_SIZE]) { + NL_SET_ERR_MSG_ATTR(extack, nest, + "missing size in compact bitset"); + return -EINVAL; + } + if (!tb[ETHTOOL_A_BITSET_VALUE]) { + NL_SET_ERR_MSG_ATTR(extack, nest, + "missing value in compact bitset"); + return -EINVAL; + } + if (!no_mask && !tb[ETHTOOL_A_BITSET_MASK]) { + NL_SET_ERR_MSG_ATTR(extack, nest, + "missing mask in compact nonlist bitset"); + return -EINVAL; + } + + attr_nbits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]); + attr_nwords = DIV_ROUND_UP(attr_nbits, 32); + if (nla_len(tb[ETHTOOL_A_BITSET_VALUE]) != attr_nwords * sizeof(u32)) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], + "bitset value length does not match size"); + return -EINVAL; + } + if (tb[ETHTOOL_A_BITSET_MASK] && + nla_len(tb[ETHTOOL_A_BITSET_MASK]) != attr_nwords * sizeof(u32)) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], + "bitset mask length does not match size"); + return -EINVAL; + } + if (attr_nbits <= nbits) + return 0; + + test_attr = no_mask ? tb[ETHTOOL_A_BITSET_VALUE] : + tb[ETHTOOL_A_BITSET_MASK]; + if (ethnl_bitmap32_not_zero(nla_data(test_attr), nbits, attr_nbits)) { + NL_SET_ERR_MSG_ATTR(extack, test_attr, + "cannot modify bits past kernel bitset size"); + return -EINVAL; + } + return 0; +} + +/** + * ethnl_update_bitset32() - Apply a bitset nest to a u32 based bitmap + * @bitmap: bitmap to update + * @nbits: size of the updated bitmap in bits + * @attr: nest attribute to parse and apply + * @names: array of bit names; may be null for compact format + * @extack: extack for error reporting + * @mod: set this to true if bitmap is modified, leave as it is if not + * + * Apply bitset netsted attribute to a bitmap. If the attribute represents + * a bit list, @bitmap is set to its contents; otherwise, bits in mask are + * set to values from value. Bitmaps in the attribute may be longer than + * @nbits but the message must not request modifying any bits past @nbits. + * + * Return: negative error code on failure, 0 on success + */ +int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits, + const struct nlattr *attr, ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod) +{ + struct nlattr *tb[ETHTOOL_A_BITSET_MAX + 1]; + unsigned int change_bits; + bool no_mask; + int ret; + + if (!attr) + return 0; + ret = nla_parse_nested(tb, ETHTOOL_A_BITSET_MAX, attr, bitset_policy, + extack); + if (ret < 0) + return ret; + + if (tb[ETHTOOL_A_BITSET_BITS]) + return ethnl_update_bitset32_verbose(bitmap, nbits, attr, tb, + names, extack, mod); + ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack); + if (ret < 0) + return ret; + + no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; + change_bits = min_t(unsigned int, + nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]), nbits); + ethnl_bitmap32_update(bitmap, change_bits, + nla_data(tb[ETHTOOL_A_BITSET_VALUE]), + no_mask ? NULL : + nla_data(tb[ETHTOOL_A_BITSET_MASK]), + mod); + if (no_mask && change_bits < nbits) + ethnl_bitmap32_clear(bitmap, change_bits, nbits, mod); + + return 0; +} + +#if BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) + +/* 64-bit big endian architectures are the only case when u32 based bitmaps + * and unsigned long based bitmaps have different memory layout so that we + * cannot simply cast the latter to the former and need actual wrappers + * converting the latter to the former. + * + * To reduce the number of slab allocations, the wrappers use fixed size local + * variables for bitmaps up to ETHNL_SMALL_BITMAP_BITS bits which is the + * majority of bitmaps used by ethtool. + */ +#define ETHNL_SMALL_BITMAP_BITS 128 +#define ETHNL_SMALL_BITMAP_WORDS DIV_ROUND_UP(ETHNL_SMALL_BITMAP_BITS, 32) + +int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact) +{ + u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; + u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; + u32 *mask32; + u32 *val32; + int ret; + + if (nbits > ETHNL_SMALL_BITMAP_BITS) { + unsigned int nwords = DIV_ROUND_UP(nbits, 32); + + val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); + if (!val32) + return -ENOMEM; + mask32 = val32 + nwords; + } else { + val32 = small_val32; + mask32 = small_mask32; + } + + bitmap_to_arr32(val32, val, nbits); + if (mask) + bitmap_to_arr32(mask32, mask, nbits); + else + mask32 = NULL; + ret = ethnl_bitset32_size(val32, mask32, nbits, names, compact); + + if (nbits > ETHNL_SMALL_BITMAP_BITS) + kfree(val32); + + return ret; +} + +int ethnl_put_bitset(struct sk_buff *skb, int attrtype, + const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact) +{ + u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; + u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; + u32 *mask32; + u32 *val32; + int ret; + + if (nbits > ETHNL_SMALL_BITMAP_BITS) { + unsigned int nwords = DIV_ROUND_UP(nbits, 32); + + val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); + if (!val32) + return -ENOMEM; + mask32 = val32 + nwords; + } else { + val32 = small_val32; + mask32 = small_mask32; + } + + bitmap_to_arr32(val32, val, nbits); + if (mask) + bitmap_to_arr32(mask32, mask, nbits); + else + mask32 = NULL; + ret = ethnl_put_bitset32(skb, attrtype, val32, mask32, nbits, names, + compact); + + if (nbits > ETHNL_SMALL_BITMAP_BITS) + kfree(val32); + + return ret; +} + +int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, + const struct nlattr *attr, ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod) +{ + u32 small_bitmap32[ETHNL_SMALL_BITMAP_WORDS]; + u32 *bitmap32 = small_bitmap32; + bool u32_mod = false; + int ret; + + if (nbits > ETHNL_SMALL_BITMAP_BITS) { + unsigned int dst_words = DIV_ROUND_UP(nbits, 32); + + bitmap32 = kmalloc_array(dst_words, sizeof(u32), GFP_KERNEL); + if (!bitmap32) + return -ENOMEM; + } + + bitmap_to_arr32(bitmap32, bitmap, nbits); + ret = ethnl_update_bitset32(bitmap32, nbits, attr, names, extack, + &u32_mod); + if (u32_mod) { + bitmap_from_arr32(bitmap, bitmap32, nbits); + *mod = true; + } + + if (nbits > ETHNL_SMALL_BITMAP_BITS) + kfree(bitmap32); + + return ret; +} + +#else + +/* On little endian 64-bit and all 32-bit architectures, an unsigned long + * based bitmap can be interpreted as u32 based one using a simple cast. + */ + +int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact) +{ + return ethnl_bitset32_size((const u32 *)val, (const u32 *)mask, nbits, + names, compact); +} + +int ethnl_put_bitset(struct sk_buff *skb, int attrtype, + const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact) +{ + return ethnl_put_bitset32(skb, attrtype, (const u32 *)val, + (const u32 *)mask, nbits, names, compact); +} + +int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, + const struct nlattr *attr, ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod) +{ + return ethnl_update_bitset32((u32 *)bitmap, nbits, attr, names, extack, + mod); +} + +#endif /* BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) */ diff --git a/net/ethtool/bitset.h b/net/ethtool/bitset.h new file mode 100644 index 000000000000..b8247e34109d --- /dev/null +++ b/net/ethtool/bitset.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _NET_ETHTOOL_BITSET_H +#define _NET_ETHTOOL_BITSET_H + +typedef const char (*const ethnl_string_array_t)[ETH_GSTRING_LEN]; + +int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact); +int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact); +int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits, + ethnl_string_array_t names, bool compact); +int ethnl_put_bitset(struct sk_buff *skb, int attrtype, + const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact); +int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val, + const u32 *mask, unsigned int nbits, + ethnl_string_array_t names, bool compact); +int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, + const struct nlattr *attr, ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod); +int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits, + const struct nlattr *attr, ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod); + +#endif /* _NET_ETHTOOL_BITSET_H */ diff --git a/net/ethtool/common.c b/net/ethtool/common.c new file mode 100644 index 000000000000..636ec6d5110e --- /dev/null +++ b/net/ethtool/common.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "common.h" + +const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { + [NETIF_F_SG_BIT] = "tx-scatter-gather", + [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4", + [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic", + [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6", + [NETIF_F_HIGHDMA_BIT] = "highdma", + [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist", + [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert", + + [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse", + [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter", + [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert", + [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse", + [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter", + [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", + [NETIF_F_GSO_BIT] = "tx-generic-segmentation", + [NETIF_F_LLTX_BIT] = "tx-lockless", + [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", + [NETIF_F_GRO_BIT] = "rx-gro", + [NETIF_F_GRO_HW_BIT] = "rx-gro-hw", + [NETIF_F_LRO_BIT] = "rx-lro", + + [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", + [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust", + [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", + [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation", + [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", + [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", + [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", + [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation", + [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation", + [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation", + [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", + [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", + [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", + [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", + [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", + [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation", + + [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", + [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", + [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu", + [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter", + [NETIF_F_RXHASH_BIT] = "rx-hashing", + [NETIF_F_RXCSUM_BIT] = "rx-checksum", + [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy", + [NETIF_F_LOOPBACK_BIT] = "loopback", + [NETIF_F_RXFCS_BIT] = "rx-fcs", + [NETIF_F_RXALL_BIT] = "rx-all", + [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", + [NETIF_F_HW_TC_BIT] = "hw-tc-offload", + [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", + [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", + [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", + [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", + [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", + [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", + [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list", +}; + +const char +rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { + [ETH_RSS_HASH_TOP_BIT] = "toeplitz", + [ETH_RSS_HASH_XOR_BIT] = "xor", + [ETH_RSS_HASH_CRC32_BIT] = "crc32", +}; + +const char +tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { + [ETHTOOL_ID_UNSPEC] = "Unspec", + [ETHTOOL_RX_COPYBREAK] = "rx-copybreak", + [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", + [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout", +}; + +const char +phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = { + [ETHTOOL_ID_UNSPEC] = "Unspec", + [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift", + [ETHTOOL_PHY_FAST_LINK_DOWN] = "phy-fast-link-down", + [ETHTOOL_PHY_EDPD] = "phy-energy-detect-power-down", +}; + +#define __LINK_MODE_NAME(speed, type, duplex) \ + #speed "base" #type "/" #duplex +#define __DEFINE_LINK_MODE_NAME(speed, type, duplex) \ + [ETHTOOL_LINK_MODE(speed, type, duplex)] = \ + __LINK_MODE_NAME(speed, type, duplex) +#define __DEFINE_SPECIAL_MODE_NAME(_mode, _name) \ + [ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = _name + +const char link_mode_names[][ETH_GSTRING_LEN] = { + __DEFINE_LINK_MODE_NAME(10, T, Half), + __DEFINE_LINK_MODE_NAME(10, T, Full), + __DEFINE_LINK_MODE_NAME(100, T, Half), + __DEFINE_LINK_MODE_NAME(100, T, Full), + __DEFINE_LINK_MODE_NAME(1000, T, Half), + __DEFINE_LINK_MODE_NAME(1000, T, Full), + __DEFINE_SPECIAL_MODE_NAME(Autoneg, "Autoneg"), + __DEFINE_SPECIAL_MODE_NAME(TP, "TP"), + __DEFINE_SPECIAL_MODE_NAME(AUI, "AUI"), + __DEFINE_SPECIAL_MODE_NAME(MII, "MII"), + __DEFINE_SPECIAL_MODE_NAME(FIBRE, "FIBRE"), + __DEFINE_SPECIAL_MODE_NAME(BNC, "BNC"), + __DEFINE_LINK_MODE_NAME(10000, T, Full), + __DEFINE_SPECIAL_MODE_NAME(Pause, "Pause"), + __DEFINE_SPECIAL_MODE_NAME(Asym_Pause, "Asym_Pause"), + __DEFINE_LINK_MODE_NAME(2500, X, Full), + __DEFINE_SPECIAL_MODE_NAME(Backplane, "Backplane"), + __DEFINE_LINK_MODE_NAME(1000, KX, Full), + __DEFINE_LINK_MODE_NAME(10000, KX4, Full), + __DEFINE_LINK_MODE_NAME(10000, KR, Full), + __DEFINE_SPECIAL_MODE_NAME(10000baseR_FEC, "10000baseR_FEC"), + __DEFINE_LINK_MODE_NAME(20000, MLD2, Full), + __DEFINE_LINK_MODE_NAME(20000, KR2, Full), + __DEFINE_LINK_MODE_NAME(40000, KR4, Full), + __DEFINE_LINK_MODE_NAME(40000, CR4, Full), + __DEFINE_LINK_MODE_NAME(40000, SR4, Full), + __DEFINE_LINK_MODE_NAME(40000, LR4, Full), + __DEFINE_LINK_MODE_NAME(56000, KR4, Full), + __DEFINE_LINK_MODE_NAME(56000, CR4, Full), + __DEFINE_LINK_MODE_NAME(56000, SR4, Full), + __DEFINE_LINK_MODE_NAME(56000, LR4, Full), + __DEFINE_LINK_MODE_NAME(25000, CR, Full), + __DEFINE_LINK_MODE_NAME(25000, KR, Full), + __DEFINE_LINK_MODE_NAME(25000, SR, Full), + __DEFINE_LINK_MODE_NAME(50000, CR2, Full), + __DEFINE_LINK_MODE_NAME(50000, KR2, Full), + __DEFINE_LINK_MODE_NAME(100000, KR4, Full), + __DEFINE_LINK_MODE_NAME(100000, SR4, Full), + __DEFINE_LINK_MODE_NAME(100000, CR4, Full), + __DEFINE_LINK_MODE_NAME(100000, LR4_ER4, Full), + __DEFINE_LINK_MODE_NAME(50000, SR2, Full), + __DEFINE_LINK_MODE_NAME(1000, X, Full), + __DEFINE_LINK_MODE_NAME(10000, CR, Full), + __DEFINE_LINK_MODE_NAME(10000, SR, Full), + __DEFINE_LINK_MODE_NAME(10000, LR, Full), + __DEFINE_LINK_MODE_NAME(10000, LRM, Full), + __DEFINE_LINK_MODE_NAME(10000, ER, Full), + __DEFINE_LINK_MODE_NAME(2500, T, Full), + __DEFINE_LINK_MODE_NAME(5000, T, Full), + __DEFINE_SPECIAL_MODE_NAME(FEC_NONE, "None"), + __DEFINE_SPECIAL_MODE_NAME(FEC_RS, "RS"), + __DEFINE_SPECIAL_MODE_NAME(FEC_BASER, "BASER"), + __DEFINE_LINK_MODE_NAME(50000, KR, Full), + __DEFINE_LINK_MODE_NAME(50000, SR, Full), + __DEFINE_LINK_MODE_NAME(50000, CR, Full), + __DEFINE_LINK_MODE_NAME(50000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_NAME(50000, DR, Full), + __DEFINE_LINK_MODE_NAME(100000, KR2, Full), + __DEFINE_LINK_MODE_NAME(100000, SR2, Full), + __DEFINE_LINK_MODE_NAME(100000, CR2, Full), + __DEFINE_LINK_MODE_NAME(100000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_NAME(100000, DR2, Full), + __DEFINE_LINK_MODE_NAME(200000, KR4, Full), + __DEFINE_LINK_MODE_NAME(200000, SR4, Full), + __DEFINE_LINK_MODE_NAME(200000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_NAME(200000, DR4, Full), + __DEFINE_LINK_MODE_NAME(200000, CR4, Full), + __DEFINE_LINK_MODE_NAME(100, T1, Full), + __DEFINE_LINK_MODE_NAME(1000, T1, Full), + __DEFINE_LINK_MODE_NAME(400000, KR8, Full), + __DEFINE_LINK_MODE_NAME(400000, SR8, Full), + __DEFINE_LINK_MODE_NAME(400000, LR8_ER8_FR8, Full), + __DEFINE_LINK_MODE_NAME(400000, DR8, Full), + __DEFINE_LINK_MODE_NAME(400000, CR8, Full), +}; +static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); + +const char netif_msg_class_names[][ETH_GSTRING_LEN] = { + [NETIF_MSG_DRV_BIT] = "drv", + [NETIF_MSG_PROBE_BIT] = "probe", + [NETIF_MSG_LINK_BIT] = "link", + [NETIF_MSG_TIMER_BIT] = "timer", + [NETIF_MSG_IFDOWN_BIT] = "ifdown", + [NETIF_MSG_IFUP_BIT] = "ifup", + [NETIF_MSG_RX_ERR_BIT] = "rx_err", + [NETIF_MSG_TX_ERR_BIT] = "tx_err", + [NETIF_MSG_TX_QUEUED_BIT] = "tx_queued", + [NETIF_MSG_INTR_BIT] = "intr", + [NETIF_MSG_TX_DONE_BIT] = "tx_done", + [NETIF_MSG_RX_STATUS_BIT] = "rx_status", + [NETIF_MSG_PKTDATA_BIT] = "pktdata", + [NETIF_MSG_HW_BIT] = "hw", + [NETIF_MSG_WOL_BIT] = "wol", +}; +static_assert(ARRAY_SIZE(netif_msg_class_names) == NETIF_MSG_CLASS_COUNT); + +const char wol_mode_names[][ETH_GSTRING_LEN] = { + [const_ilog2(WAKE_PHY)] = "phy", + [const_ilog2(WAKE_UCAST)] = "ucast", + [const_ilog2(WAKE_MCAST)] = "mcast", + [const_ilog2(WAKE_BCAST)] = "bcast", + [const_ilog2(WAKE_ARP)] = "arp", + [const_ilog2(WAKE_MAGIC)] = "magic", + [const_ilog2(WAKE_MAGICSECURE)] = "magicsecure", + [const_ilog2(WAKE_FILTER)] = "filter", +}; +static_assert(ARRAY_SIZE(wol_mode_names) == WOL_MODE_COUNT); + +/* return false if legacy contained non-0 deprecated fields + * maxtxpkt/maxrxpkt. rest of ksettings always updated + */ +bool +convert_legacy_settings_to_link_ksettings( + struct ethtool_link_ksettings *link_ksettings, + const struct ethtool_cmd *legacy_settings) +{ + bool retval = true; + + memset(link_ksettings, 0, sizeof(*link_ksettings)); + + /* This is used to tell users that driver is still using these + * deprecated legacy fields, and they should not use + * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS + */ + if (legacy_settings->maxtxpkt || + legacy_settings->maxrxpkt) + retval = false; + + ethtool_convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.supported, + legacy_settings->supported); + ethtool_convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.advertising, + legacy_settings->advertising); + ethtool_convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.lp_advertising, + legacy_settings->lp_advertising); + link_ksettings->base.speed + = ethtool_cmd_speed(legacy_settings); + link_ksettings->base.duplex + = legacy_settings->duplex; + link_ksettings->base.port + = legacy_settings->port; + link_ksettings->base.phy_address + = legacy_settings->phy_address; + link_ksettings->base.autoneg + = legacy_settings->autoneg; + link_ksettings->base.mdio_support + = legacy_settings->mdio_support; + link_ksettings->base.eth_tp_mdix + = legacy_settings->eth_tp_mdix; + link_ksettings->base.eth_tp_mdix_ctrl + = legacy_settings->eth_tp_mdix_ctrl; + return retval; +} + +int __ethtool_get_link(struct net_device *dev) +{ + if (!dev->ethtool_ops->get_link) + return -EOPNOTSUPP; + + return netif_running(dev) && dev->ethtool_ops->get_link(dev); +} diff --git a/net/ethtool/common.h b/net/ethtool/common.h new file mode 100644 index 000000000000..40ba74e0b9bb --- /dev/null +++ b/net/ethtool/common.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _ETHTOOL_COMMON_H +#define _ETHTOOL_COMMON_H + +#include <linux/netdevice.h> +#include <linux/ethtool.h> + +/* compose link mode index from speed, type and duplex */ +#define ETHTOOL_LINK_MODE(speed, type, duplex) \ + ETHTOOL_LINK_MODE_ ## speed ## base ## type ## _ ## duplex ## _BIT + +extern const char +netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]; +extern const char +rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN]; +extern const char +tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN]; +extern const char +phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN]; +extern const char link_mode_names[][ETH_GSTRING_LEN]; +extern const char netif_msg_class_names[][ETH_GSTRING_LEN]; +extern const char wol_mode_names[][ETH_GSTRING_LEN]; + +int __ethtool_get_link(struct net_device *dev); + +bool convert_legacy_settings_to_link_ksettings( + struct ethtool_link_ksettings *link_ksettings, + const struct ethtool_cmd *legacy_settings); + +#endif /* _ETHTOOL_COMMON_H */ diff --git a/net/ethtool/debug.c b/net/ethtool/debug.c new file mode 100644 index 000000000000..aaef4843e6ba --- /dev/null +++ b/net/ethtool/debug.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct debug_req_info { + struct ethnl_req_info base; +}; + +struct debug_reply_data { + struct ethnl_reply_data base; + u32 msg_mask; +}; + +#define DEBUG_REPDATA(__reply_base) \ + container_of(__reply_base, struct debug_reply_data, base) + +static const struct nla_policy +debug_get_policy[ETHTOOL_A_DEBUG_MAX + 1] = { + [ETHTOOL_A_DEBUG_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_DEBUG_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_DEBUG_MSGMASK] = { .type = NLA_REJECT }, +}; + +static int debug_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct debug_reply_data *data = DEBUG_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + if (!dev->ethtool_ops->get_msglevel) + return -EOPNOTSUPP; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + data->msg_mask = dev->ethtool_ops->get_msglevel(dev); + ethnl_ops_complete(dev); + + return 0; +} + +static int debug_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct debug_reply_data *data = DEBUG_REPDATA(reply_base); + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + + return ethnl_bitset32_size(&data->msg_mask, NULL, NETIF_MSG_CLASS_COUNT, + netif_msg_class_names, compact); +} + +static int debug_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct debug_reply_data *data = DEBUG_REPDATA(reply_base); + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + + return ethnl_put_bitset32(skb, ETHTOOL_A_DEBUG_MSGMASK, &data->msg_mask, + NULL, NETIF_MSG_CLASS_COUNT, + netif_msg_class_names, compact); +} + +const struct ethnl_request_ops ethnl_debug_request_ops = { + .request_cmd = ETHTOOL_MSG_DEBUG_GET, + .reply_cmd = ETHTOOL_MSG_DEBUG_GET_REPLY, + .hdr_attr = ETHTOOL_A_DEBUG_HEADER, + .max_attr = ETHTOOL_A_DEBUG_MAX, + .req_info_size = sizeof(struct debug_req_info), + .reply_data_size = sizeof(struct debug_reply_data), + .request_policy = debug_get_policy, + + .prepare_data = debug_prepare_data, + .reply_size = debug_reply_size, + .fill_reply = debug_fill_reply, +}; + +/* DEBUG_SET */ + +static const struct nla_policy +debug_set_policy[ETHTOOL_A_DEBUG_MAX + 1] = { + [ETHTOOL_A_DEBUG_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_DEBUG_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_DEBUG_MSGMASK] = { .type = NLA_NESTED }, +}; + +int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_DEBUG_MAX + 1]; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + bool mod = false; + u32 msg_mask; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_DEBUG_MAX, debug_set_policy, + info->extack); + if (ret < 0) + return ret; + ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_DEBUG_HEADER], + genl_info_net(info), info->extack, true); + if (ret < 0) + return ret; + dev = req_info.dev; + if (!dev->ethtool_ops->get_msglevel || !dev->ethtool_ops->set_msglevel) + return -EOPNOTSUPP; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + msg_mask = dev->ethtool_ops->get_msglevel(dev); + ret = ethnl_update_bitset32(&msg_mask, NETIF_MSG_CLASS_COUNT, + tb[ETHTOOL_A_DEBUG_MSGMASK], + netif_msg_class_names, info->extack, &mod); + if (ret < 0 || !mod) + goto out_ops; + + dev->ethtool_ops->set_msglevel(dev, msg_mask); + ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF, NULL); + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); + dev_put(dev); + return ret; +} diff --git a/net/core/ethtool.c b/net/ethtool/ioctl.c index 6288e69e94fc..b987052d91ef 100644 --- a/net/core/ethtool.c +++ b/net/ethtool/ioctl.c @@ -17,6 +17,7 @@ #include <linux/phy.h> #include <linux/bitops.h> #include <linux/uaccess.h> +#include <linux/vermagic.h> #include <linux/vmalloc.h> #include <linux/sfp.h> #include <linux/slab.h> @@ -26,6 +27,9 @@ #include <net/devlink.h> #include <net/xdp_sock.h> #include <net/flow_offload.h> +#include <linux/ethtool_netlink.h> + +#include "common.h" /* * Some useful ethtool_ops methods that're device independent. @@ -54,87 +58,6 @@ EXPORT_SYMBOL(ethtool_op_get_ts_info); #define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32) -static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { - [NETIF_F_SG_BIT] = "tx-scatter-gather", - [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4", - [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic", - [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6", - [NETIF_F_HIGHDMA_BIT] = "highdma", - [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist", - [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert", - - [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse", - [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter", - [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert", - [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse", - [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter", - [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", - [NETIF_F_GSO_BIT] = "tx-generic-segmentation", - [NETIF_F_LLTX_BIT] = "tx-lockless", - [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", - [NETIF_F_GRO_BIT] = "rx-gro", - [NETIF_F_GRO_HW_BIT] = "rx-gro-hw", - [NETIF_F_LRO_BIT] = "rx-lro", - - [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", - [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust", - [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", - [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation", - [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", - [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", - [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", - [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation", - [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation", - [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation", - [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", - [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", - [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", - [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", - [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", - [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation", - - [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", - [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", - [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu", - [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter", - [NETIF_F_RXHASH_BIT] = "rx-hashing", - [NETIF_F_RXCSUM_BIT] = "rx-checksum", - [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy", - [NETIF_F_LOOPBACK_BIT] = "loopback", - [NETIF_F_RXFCS_BIT] = "rx-fcs", - [NETIF_F_RXALL_BIT] = "rx-all", - [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", - [NETIF_F_HW_TC_BIT] = "hw-tc-offload", - [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", - [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", - [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", - [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", - [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", - [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", -}; - -static const char -rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { - [ETH_RSS_HASH_TOP_BIT] = "toeplitz", - [ETH_RSS_HASH_XOR_BIT] = "xor", - [ETH_RSS_HASH_CRC32_BIT] = "crc32", -}; - -static const char -tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { - [ETHTOOL_ID_UNSPEC] = "Unspec", - [ETHTOOL_RX_COPYBREAK] = "rx-copybreak", - [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", - [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout", -}; - -static const char -phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = { - [ETHTOOL_ID_UNSPEC] = "Unspec", - [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift", - [ETHTOOL_PHY_FAST_LINK_DOWN] = "phy-fast-link-down", -}; - static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { @@ -233,6 +156,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) !ops->get_ethtool_phy_stats) return phy_ethtool_get_sset_count(dev->phydev); + if (sset == ETH_SS_LINK_MODES) + return __ETHTOOL_LINK_MODE_MASK_NBITS; + if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else @@ -257,6 +183,9 @@ static void __ethtool_get_strings(struct net_device *dev, else if (stringset == ETH_SS_PHY_STATS && dev->phydev && !ops->get_ethtool_phy_stats) phy_ethtool_get_strings(dev->phydev, data); + else if (stringset == ETH_SS_LINK_MODES) + memcpy(data, link_mode_names, + __ETHTOOL_LINK_MODE_MASK_NBITS * ETH_GSTRING_LEN); else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); @@ -431,54 +360,6 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, } EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32); -/* return false if legacy contained non-0 deprecated fields - * maxtxpkt/maxrxpkt. rest of ksettings always updated - */ -static bool -convert_legacy_settings_to_link_ksettings( - struct ethtool_link_ksettings *link_ksettings, - const struct ethtool_cmd *legacy_settings) -{ - bool retval = true; - - memset(link_ksettings, 0, sizeof(*link_ksettings)); - - /* This is used to tell users that driver is still using these - * deprecated legacy fields, and they should not use - * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS - */ - if (legacy_settings->maxtxpkt || - legacy_settings->maxrxpkt) - retval = false; - - ethtool_convert_legacy_u32_to_link_mode( - link_ksettings->link_modes.supported, - legacy_settings->supported); - ethtool_convert_legacy_u32_to_link_mode( - link_ksettings->link_modes.advertising, - legacy_settings->advertising); - ethtool_convert_legacy_u32_to_link_mode( - link_ksettings->link_modes.lp_advertising, - legacy_settings->lp_advertising); - link_ksettings->base.speed - = ethtool_cmd_speed(legacy_settings); - link_ksettings->base.duplex - = legacy_settings->duplex; - link_ksettings->base.port - = legacy_settings->port; - link_ksettings->base.phy_address - = legacy_settings->phy_address; - link_ksettings->base.autoneg - = legacy_settings->autoneg; - link_ksettings->base.mdio_support - = legacy_settings->mdio_support; - link_ksettings->base.eth_tp_mdix - = legacy_settings->eth_tp_mdix; - link_ksettings->base.eth_tp_mdix_ctrl - = legacy_settings->eth_tp_mdix_ctrl; - return retval; -} - /* return false if ksettings link modes had higher bits * set. legacy_settings always updated (best effort) */ @@ -692,7 +573,12 @@ static int ethtool_set_link_ksettings(struct net_device *dev, != link_ksettings.base.link_mode_masks_nwords) return -EINVAL; - return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); + err = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); + if (err >= 0) { + ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); + ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); + } + return err; } /* Query device for its ethtool_cmd settings. @@ -741,6 +627,7 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) { struct ethtool_link_ksettings link_ksettings; struct ethtool_cmd cmd; + int ret; ASSERT_RTNL(); @@ -753,7 +640,12 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) return -EINVAL; link_ksettings.base.link_mode_masks_nwords = __ETHTOOL_LINK_MODE_MASK_NU32; - return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); + ret = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); + if (ret >= 0) { + ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); + ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); + } + return ret; } static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, @@ -764,6 +656,7 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GDRVINFO; + strlcpy(info.version, UTS_RELEASE, sizeof(info.version)); if (ops->get_drvinfo) { ops->get_drvinfo(dev, &info); } else if (dev->dev.parent && dev->dev.parent->driver) { @@ -1395,11 +1288,13 @@ static int ethtool_reset(struct net_device *dev, char __user *useraddr) static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) { - struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; + struct ethtool_wolinfo wol; if (!dev->ethtool_ops->get_wol) return -EOPNOTSUPP; + memset(&wol, 0, sizeof(struct ethtool_wolinfo)); + wol.cmd = ETHTOOL_GWOL; dev->ethtool_ops->get_wol(dev, &wol); if (copy_to_user(useraddr, &wol, sizeof(wol))) @@ -1423,6 +1318,7 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) return ret; dev->wol_enabled = !!wol.wolopts; + ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF, NULL); return 0; } @@ -1472,12 +1368,12 @@ static int ethtool_nway_reset(struct net_device *dev) static int ethtool_get_link(struct net_device *dev, char __user *useraddr) { struct ethtool_value edata = { .cmd = ETHTOOL_GLINK }; + int link = __ethtool_get_link(dev); - if (!dev->ethtool_ops->get_link) - return -EOPNOTSUPP; - - edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev); + if (link < 0) + return link; + edata.data = link; if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; @@ -2167,8 +2063,8 @@ static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr) memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GET_TS_INFO; - if (phydev && phydev->drv && phydev->drv->ts_info) { - err = phydev->drv->ts_info(phydev, &info); + if (phy_has_tsinfo(phydev)) { + err = phy_ts_info(phydev, &info); } else if (ops->get_ts_info) { err = ops->get_ts_info(dev, &info); } else { @@ -2451,6 +2347,11 @@ static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna) tuna->type_id != ETHTOOL_TUNABLE_U8) return -EINVAL; break; + case ETHTOOL_PHY_EDPD: + if (tuna->len != sizeof(u16) || + tuna->type_id != ETHTOOL_TUNABLE_U16) + return -EINVAL; + break; default: return -EINVAL; } @@ -2647,6 +2548,8 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_SMSGLVL: rc = ethtool_set_value_void(dev, useraddr, dev->ethtool_ops->set_msglevel); + if (!rc) + ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF, NULL); break; case ETHTOOL_GEEE: rc = ethtool_get_eee(dev, useraddr); diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c new file mode 100644 index 000000000000..5d16cb4e8693 --- /dev/null +++ b/net/ethtool/linkinfo.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" + +struct linkinfo_req_info { + struct ethnl_req_info base; +}; + +struct linkinfo_reply_data { + struct ethnl_reply_data base; + struct ethtool_link_ksettings ksettings; + struct ethtool_link_settings *lsettings; +}; + +#define LINKINFO_REPDATA(__reply_base) \ + container_of(__reply_base, struct linkinfo_reply_data, base) + +static const struct nla_policy +linkinfo_get_policy[ETHTOOL_A_LINKINFO_MAX + 1] = { + [ETHTOOL_A_LINKINFO_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKINFO_PORT] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_PHYADDR] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_TP_MDIX] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_TRANSCEIVER] = { .type = NLA_REJECT }, +}; + +static int linkinfo_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + data->lsettings = &data->ksettings.base; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + ret = __ethtool_get_link_ksettings(dev, &data->ksettings); + if (ret < 0 && info) + GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); + ethnl_ops_complete(dev); + + return ret; +} + +static int linkinfo_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + return nla_total_size(sizeof(u8)) /* LINKINFO_PORT */ + + nla_total_size(sizeof(u8)) /* LINKINFO_PHYADDR */ + + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX */ + + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX_CTRL */ + + nla_total_size(sizeof(u8)) /* LINKINFO_TRANSCEIVER */ + + 0; +} + +static int linkinfo_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base); + + if (nla_put_u8(skb, ETHTOOL_A_LINKINFO_PORT, data->lsettings->port) || + nla_put_u8(skb, ETHTOOL_A_LINKINFO_PHYADDR, + data->lsettings->phy_address) || + nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX, + data->lsettings->eth_tp_mdix) || + nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, + data->lsettings->eth_tp_mdix_ctrl) || + nla_put_u8(skb, ETHTOOL_A_LINKINFO_TRANSCEIVER, + data->lsettings->transceiver)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_linkinfo_request_ops = { + .request_cmd = ETHTOOL_MSG_LINKINFO_GET, + .reply_cmd = ETHTOOL_MSG_LINKINFO_GET_REPLY, + .hdr_attr = ETHTOOL_A_LINKINFO_HEADER, + .max_attr = ETHTOOL_A_LINKINFO_MAX, + .req_info_size = sizeof(struct linkinfo_req_info), + .reply_data_size = sizeof(struct linkinfo_reply_data), + .request_policy = linkinfo_get_policy, + + .prepare_data = linkinfo_prepare_data, + .reply_size = linkinfo_reply_size, + .fill_reply = linkinfo_fill_reply, +}; + +/* LINKINFO_SET */ + +static const struct nla_policy +linkinfo_set_policy[ETHTOOL_A_LINKINFO_MAX + 1] = { + [ETHTOOL_A_LINKINFO_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKINFO_PORT] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKINFO_PHYADDR] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKINFO_TP_MDIX] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKINFO_TRANSCEIVER] = { .type = NLA_REJECT }, +}; + +int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_LINKINFO_MAX + 1]; + struct ethtool_link_ksettings ksettings = {}; + struct ethtool_link_settings *lsettings; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + bool mod = false; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_LINKINFO_MAX, linkinfo_set_policy, + info->extack); + if (ret < 0) + return ret; + ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_LINKINFO_HEADER], + genl_info_net(info), info->extack, true); + if (ret < 0) + return ret; + dev = req_info.dev; + if (!dev->ethtool_ops->get_link_ksettings || + !dev->ethtool_ops->set_link_ksettings) + return -EOPNOTSUPP; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = __ethtool_get_link_ksettings(dev, &ksettings); + if (ret < 0) { + if (info) + GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); + goto out_ops; + } + lsettings = &ksettings.base; + + ethnl_update_u8(&lsettings->port, tb[ETHTOOL_A_LINKINFO_PORT], &mod); + ethnl_update_u8(&lsettings->phy_address, tb[ETHTOOL_A_LINKINFO_PHYADDR], + &mod); + ethnl_update_u8(&lsettings->eth_tp_mdix_ctrl, + tb[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL], &mod); + ret = 0; + if (!mod) + goto out_ops; + + ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); + if (ret < 0) + GENL_SET_ERR_MSG(info, "link settings update failed"); + else + ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); + dev_put(dev); + return ret; +} diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c new file mode 100644 index 000000000000..96f20be64553 --- /dev/null +++ b/net/ethtool/linkmodes.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct linkmodes_req_info { + struct ethnl_req_info base; +}; + +struct linkmodes_reply_data { + struct ethnl_reply_data base; + struct ethtool_link_ksettings ksettings; + struct ethtool_link_settings *lsettings; + bool peer_empty; +}; + +#define LINKMODES_REPDATA(__reply_base) \ + container_of(__reply_base, struct linkmodes_reply_data, base) + +static const struct nla_policy +linkmodes_get_policy[ETHTOOL_A_LINKMODES_MAX + 1] = { + [ETHTOOL_A_LINKMODES_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKMODES_AUTONEG] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_OURS] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_REJECT }, +}; + +static int linkmodes_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + data->lsettings = &data->ksettings.base; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + ret = __ethtool_get_link_ksettings(dev, &data->ksettings); + if (ret < 0 && info) { + GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); + goto out; + } + + data->peer_empty = + bitmap_empty(data->ksettings.link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); + +out: + ethnl_ops_complete(dev); + return ret; +} + +static int linkmodes_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); + const struct ethtool_link_ksettings *ksettings = &data->ksettings; + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + int len, ret; + + len = nla_total_size(sizeof(u8)) /* LINKMODES_AUTONEG */ + + nla_total_size(sizeof(u32)) /* LINKMODES_SPEED */ + + nla_total_size(sizeof(u8)) /* LINKMODES_DUPLEX */ + + 0; + ret = ethnl_bitset_size(ksettings->link_modes.advertising, + ksettings->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); + if (ret < 0) + return ret; + len += ret; + if (!data->peer_empty) { + ret = ethnl_bitset_size(ksettings->link_modes.lp_advertising, + NULL, __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); + if (ret < 0) + return ret; + len += ret; + } + + return len; +} + +static int linkmodes_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); + const struct ethtool_link_ksettings *ksettings = &data->ksettings; + const struct ethtool_link_settings *lsettings = &ksettings->base; + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + int ret; + + if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_AUTONEG, lsettings->autoneg)) + return -EMSGSIZE; + + ret = ethnl_put_bitset(skb, ETHTOOL_A_LINKMODES_OURS, + ksettings->link_modes.advertising, + ksettings->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS, link_mode_names, + compact); + if (ret < 0) + return -EMSGSIZE; + if (!data->peer_empty) { + ret = ethnl_put_bitset(skb, ETHTOOL_A_LINKMODES_PEER, + ksettings->link_modes.lp_advertising, + NULL, __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); + if (ret < 0) + return -EMSGSIZE; + } + + if (nla_put_u32(skb, ETHTOOL_A_LINKMODES_SPEED, lsettings->speed) || + nla_put_u8(skb, ETHTOOL_A_LINKMODES_DUPLEX, lsettings->duplex)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_linkmodes_request_ops = { + .request_cmd = ETHTOOL_MSG_LINKMODES_GET, + .reply_cmd = ETHTOOL_MSG_LINKMODES_GET_REPLY, + .hdr_attr = ETHTOOL_A_LINKMODES_HEADER, + .max_attr = ETHTOOL_A_LINKMODES_MAX, + .req_info_size = sizeof(struct linkmodes_req_info), + .reply_data_size = sizeof(struct linkmodes_reply_data), + .request_policy = linkmodes_get_policy, + + .prepare_data = linkmodes_prepare_data, + .reply_size = linkmodes_reply_size, + .fill_reply = linkmodes_fill_reply, +}; + +/* LINKMODES_SET */ + +struct link_mode_info { + int speed; + u8 duplex; +}; + +#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \ + [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ + .speed = SPEED_ ## _speed, \ + .duplex = __DUPLEX_ ## _duplex \ + } +#define __DUPLEX_Half DUPLEX_HALF +#define __DUPLEX_Full DUPLEX_FULL +#define __DEFINE_SPECIAL_MODE_PARAMS(_mode) \ + [ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = { \ + .speed = SPEED_UNKNOWN, \ + .duplex = DUPLEX_UNKNOWN, \ + } + +static const struct link_mode_info link_mode_params[] = { + __DEFINE_LINK_MODE_PARAMS(10, T, Half), + __DEFINE_LINK_MODE_PARAMS(10, T, Full), + __DEFINE_LINK_MODE_PARAMS(100, T, Half), + __DEFINE_LINK_MODE_PARAMS(100, T, Full), + __DEFINE_LINK_MODE_PARAMS(1000, T, Half), + __DEFINE_LINK_MODE_PARAMS(1000, T, Full), + __DEFINE_SPECIAL_MODE_PARAMS(Autoneg), + __DEFINE_SPECIAL_MODE_PARAMS(TP), + __DEFINE_SPECIAL_MODE_PARAMS(AUI), + __DEFINE_SPECIAL_MODE_PARAMS(MII), + __DEFINE_SPECIAL_MODE_PARAMS(FIBRE), + __DEFINE_SPECIAL_MODE_PARAMS(BNC), + __DEFINE_LINK_MODE_PARAMS(10000, T, Full), + __DEFINE_SPECIAL_MODE_PARAMS(Pause), + __DEFINE_SPECIAL_MODE_PARAMS(Asym_Pause), + __DEFINE_LINK_MODE_PARAMS(2500, X, Full), + __DEFINE_SPECIAL_MODE_PARAMS(Backplane), + __DEFINE_LINK_MODE_PARAMS(1000, KX, Full), + __DEFINE_LINK_MODE_PARAMS(10000, KX4, Full), + __DEFINE_LINK_MODE_PARAMS(10000, KR, Full), + [ETHTOOL_LINK_MODE_10000baseR_FEC_BIT] = { + .speed = SPEED_10000, + .duplex = DUPLEX_FULL, + }, + __DEFINE_LINK_MODE_PARAMS(20000, MLD2, Full), + __DEFINE_LINK_MODE_PARAMS(20000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(40000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(40000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(40000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(40000, LR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, LR4, Full), + __DEFINE_LINK_MODE_PARAMS(25000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(25000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(25000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(50000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(100000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(100000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(100000, LR4_ER4, Full), + __DEFINE_LINK_MODE_PARAMS(50000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(1000, X, Full), + __DEFINE_LINK_MODE_PARAMS(10000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(10000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(10000, LR, Full), + __DEFINE_LINK_MODE_PARAMS(10000, LRM, Full), + __DEFINE_LINK_MODE_PARAMS(10000, ER, Full), + __DEFINE_LINK_MODE_PARAMS(2500, T, Full), + __DEFINE_LINK_MODE_PARAMS(5000, T, Full), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_NONE), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_RS), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_BASER), + __DEFINE_LINK_MODE_PARAMS(50000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, DR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, DR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(100, T1, Full), + __DEFINE_LINK_MODE_PARAMS(1000, T1, Full), + __DEFINE_LINK_MODE_PARAMS(400000, KR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, SR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, LR8_ER8_FR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full), +}; + +static const struct nla_policy +linkmodes_set_policy[ETHTOOL_A_LINKMODES_MAX + 1] = { + [ETHTOOL_A_LINKMODES_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKMODES_AUTONEG] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKMODES_OURS] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_U32 }, + [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_U8 }, +}; + +/* Set advertised link modes to all supported modes matching requested speed + * and duplex values. Called when autonegotiation is on, speed or duplex is + * requested but no link mode change. This is done in userspace with ioctl() + * interface, move it into kernel for netlink. + * Returns true if advertised modes bitmap was modified. + */ +static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings, + bool req_speed, bool req_duplex) +{ + unsigned long *advertising = ksettings->link_modes.advertising; + unsigned long *supported = ksettings->link_modes.supported; + DECLARE_BITMAP(old_adv, __ETHTOOL_LINK_MODE_MASK_NBITS); + unsigned int i; + + BUILD_BUG_ON(ARRAY_SIZE(link_mode_params) != + __ETHTOOL_LINK_MODE_MASK_NBITS); + + bitmap_copy(old_adv, advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); + + for (i = 0; i < __ETHTOOL_LINK_MODE_MASK_NBITS; i++) { + const struct link_mode_info *info = &link_mode_params[i]; + + if (info->speed == SPEED_UNKNOWN) + continue; + if (test_bit(i, supported) && + (!req_speed || info->speed == ksettings->base.speed) && + (!req_duplex || info->duplex == ksettings->base.duplex)) + set_bit(i, advertising); + else + clear_bit(i, advertising); + } + + return !bitmap_equal(old_adv, advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); +} + +static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, + struct ethtool_link_ksettings *ksettings, + bool *mod) +{ + struct ethtool_link_settings *lsettings = &ksettings->base; + bool req_speed, req_duplex; + int ret; + + *mod = false; + req_speed = tb[ETHTOOL_A_LINKMODES_SPEED]; + req_duplex = tb[ETHTOOL_A_LINKMODES_DUPLEX]; + + ethnl_update_u8(&lsettings->autoneg, tb[ETHTOOL_A_LINKMODES_AUTONEG], + mod); + ret = ethnl_update_bitset(ksettings->link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS, + tb[ETHTOOL_A_LINKMODES_OURS], link_mode_names, + info->extack, mod); + if (ret < 0) + return ret; + ethnl_update_u32(&lsettings->speed, tb[ETHTOOL_A_LINKMODES_SPEED], + mod); + ethnl_update_u8(&lsettings->duplex, tb[ETHTOOL_A_LINKMODES_DUPLEX], + mod); + + if (!tb[ETHTOOL_A_LINKMODES_OURS] && lsettings->autoneg && + (req_speed || req_duplex) && + ethnl_auto_linkmodes(ksettings, req_speed, req_duplex)) + *mod = true; + + return 0; +} + +int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_LINKMODES_MAX + 1]; + struct ethtool_link_ksettings ksettings = {}; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + bool mod = false; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_LINKMODES_MAX, linkmodes_set_policy, + info->extack); + if (ret < 0) + return ret; + ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_LINKMODES_HEADER], + genl_info_net(info), info->extack, true); + if (ret < 0) + return ret; + dev = req_info.dev; + if (!dev->ethtool_ops->get_link_ksettings || + !dev->ethtool_ops->set_link_ksettings) + return -EOPNOTSUPP; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = __ethtool_get_link_ksettings(dev, &ksettings); + if (ret < 0) { + if (info) + GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); + goto out_ops; + } + + ret = ethnl_update_linkmodes(info, tb, &ksettings, &mod); + if (ret < 0) + goto out_ops; + + if (mod) { + ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); + if (ret < 0) + GENL_SET_ERR_MSG(info, "link settings update failed"); + else + ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); + } + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); + dev_put(dev); + return ret; +} diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c new file mode 100644 index 000000000000..2740cde0a182 --- /dev/null +++ b/net/ethtool/linkstate.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" + +struct linkstate_req_info { + struct ethnl_req_info base; +}; + +struct linkstate_reply_data { + struct ethnl_reply_data base; + int link; +}; + +#define LINKSTATE_REPDATA(__reply_base) \ + container_of(__reply_base, struct linkstate_reply_data, base) + +static const struct nla_policy +linkstate_get_policy[ETHTOOL_A_LINKSTATE_MAX + 1] = { + [ETHTOOL_A_LINKSTATE_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKSTATE_LINK] = { .type = NLA_REJECT }, +}; + +static int linkstate_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + data->link = __ethtool_get_link(dev); + ethnl_ops_complete(dev); + + return 0; +} + +static int linkstate_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + return nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + + 0; +} + +static int linkstate_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); + + if (data->link >= 0 && + nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_linkstate_request_ops = { + .request_cmd = ETHTOOL_MSG_LINKSTATE_GET, + .reply_cmd = ETHTOOL_MSG_LINKSTATE_GET_REPLY, + .hdr_attr = ETHTOOL_A_LINKSTATE_HEADER, + .max_attr = ETHTOOL_A_LINKSTATE_MAX, + .req_info_size = sizeof(struct linkstate_req_info), + .reply_data_size = sizeof(struct linkstate_reply_data), + .request_policy = linkstate_get_policy, + + .prepare_data = linkstate_prepare_data, + .reply_size = linkstate_reply_size, + .fill_reply = linkstate_fill_reply, +}; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c new file mode 100644 index 000000000000..180c194fab07 --- /dev/null +++ b/net/ethtool/netlink.c @@ -0,0 +1,729 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <net/sock.h> +#include <linux/ethtool_netlink.h> +#include "netlink.h" + +static struct genl_family ethtool_genl_family; + +static bool ethnl_ok __read_mostly; +static u32 ethnl_bcast_seq; + +static const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_MAX + 1] = { + [ETHTOOL_A_HEADER_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, + [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, + .len = ALTIFNAMSIZ - 1 }, + [ETHTOOL_A_HEADER_FLAGS] = { .type = NLA_U32 }, +}; + +/** + * ethnl_parse_header() - parse request header + * @req_info: structure to put results into + * @header: nest attribute with request header + * @net: request netns + * @extack: netlink extack for error reporting + * @require_dev: fail if no device identified in header + * + * Parse request header in nested attribute @nest and puts results into + * the structure pointed to by @req_info. Extack from @info is used for error + * reporting. If req_info->dev is not null on return, reference to it has + * been taken. If error is returned, *req_info is null initialized and no + * reference is held. + * + * Return: 0 on success or negative error code + */ +int ethnl_parse_header(struct ethnl_req_info *req_info, + const struct nlattr *header, struct net *net, + struct netlink_ext_ack *extack, bool require_dev) +{ + struct nlattr *tb[ETHTOOL_A_HEADER_MAX + 1]; + const struct nlattr *devname_attr; + struct net_device *dev = NULL; + int ret; + + if (!header) { + NL_SET_ERR_MSG(extack, "request header missing"); + return -EINVAL; + } + ret = nla_parse_nested(tb, ETHTOOL_A_HEADER_MAX, header, + ethnl_header_policy, extack); + if (ret < 0) + return ret; + devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME]; + + if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) { + u32 ifindex = nla_get_u32(tb[ETHTOOL_A_HEADER_DEV_INDEX]); + + dev = dev_get_by_index(net, ifindex); + if (!dev) { + NL_SET_ERR_MSG_ATTR(extack, + tb[ETHTOOL_A_HEADER_DEV_INDEX], + "no device matches ifindex"); + return -ENODEV; + } + /* if both ifindex and ifname are passed, they must match */ + if (devname_attr && + strncmp(dev->name, nla_data(devname_attr), IFNAMSIZ)) { + dev_put(dev); + NL_SET_ERR_MSG_ATTR(extack, header, + "ifindex and name do not match"); + return -ENODEV; + } + } else if (devname_attr) { + dev = dev_get_by_name(net, nla_data(devname_attr)); + if (!dev) { + NL_SET_ERR_MSG_ATTR(extack, devname_attr, + "no device matches name"); + return -ENODEV; + } + } else if (require_dev) { + NL_SET_ERR_MSG_ATTR(extack, header, + "neither ifindex nor name specified"); + return -EINVAL; + } + + if (dev && !netif_device_present(dev)) { + dev_put(dev); + NL_SET_ERR_MSG(extack, "device not present"); + return -ENODEV; + } + + req_info->dev = dev; + if (tb[ETHTOOL_A_HEADER_FLAGS]) + req_info->flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]); + + return 0; +} + +/** + * ethnl_fill_reply_header() - Put common header into a reply message + * @skb: skb with the message + * @dev: network device to describe in header + * @attrtype: attribute type to use for the nest + * + * Create a nested attribute with attributes describing given network device. + * + * Return: 0 on success, error value (-EMSGSIZE only) on error + */ +int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, + u16 attrtype) +{ + struct nlattr *nest; + + if (!dev) + return 0; + nest = nla_nest_start(skb, attrtype); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_HEADER_DEV_INDEX, (u32)dev->ifindex) || + nla_put_string(skb, ETHTOOL_A_HEADER_DEV_NAME, dev->name)) + goto nla_put_failure; + /* If more attributes are put into reply header, ethnl_header_size() + * must be updated to account for them. + */ + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +/** + * ethnl_reply_init() - Create skb for a reply and fill device identification + * @payload: payload length (without netlink and genetlink header) + * @dev: device the reply is about (may be null) + * @cmd: ETHTOOL_MSG_* message type for reply + * @hdr_attrtype: attribute type for common header + * @info: genetlink info of the received packet we respond to + * @ehdrp: place to store payload pointer returned by genlmsg_new() + * + * Return: pointer to allocated skb on success, NULL on error + */ +struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, + u16 hdr_attrtype, struct genl_info *info, + void **ehdrp) +{ + struct sk_buff *skb; + + skb = genlmsg_new(payload, GFP_KERNEL); + if (!skb) + goto err; + *ehdrp = genlmsg_put_reply(skb, info, ðtool_genl_family, 0, cmd); + if (!*ehdrp) + goto err_free; + + if (dev) { + int ret; + + ret = ethnl_fill_reply_header(skb, dev, hdr_attrtype); + if (ret < 0) + goto err_free; + } + return skb; + +err_free: + nlmsg_free(skb); +err: + if (info) + GENL_SET_ERR_MSG(info, "failed to setup reply message"); + return NULL; +} + +static void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) +{ + return genlmsg_put(skb, 0, ++ethnl_bcast_seq, ðtool_genl_family, 0, + cmd); +} + +static int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) +{ + return genlmsg_multicast_netns(ðtool_genl_family, dev_net(dev), skb, + 0, ETHNL_MCGRP_MONITOR, GFP_KERNEL); +} + +/* GET request helpers */ + +/** + * struct ethnl_dump_ctx - context structure for generic dumpit() callback + * @ops: request ops of currently processed message type + * @req_info: parsed request header of processed request + * @reply_data: data needed to compose the reply + * @pos_hash: saved iteration position - hashbucket + * @pos_idx: saved iteration position - index + * + * These parameters are kept in struct netlink_callback as context preserved + * between iterations. They are initialized by ethnl_default_start() and used + * in ethnl_default_dumpit() and ethnl_default_done(). + */ +struct ethnl_dump_ctx { + const struct ethnl_request_ops *ops; + struct ethnl_req_info *req_info; + struct ethnl_reply_data *reply_data; + int pos_hash; + int pos_idx; +}; + +static const struct ethnl_request_ops * +ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { + [ETHTOOL_MSG_STRSET_GET] = ðnl_strset_request_ops, + [ETHTOOL_MSG_LINKINFO_GET] = ðnl_linkinfo_request_ops, + [ETHTOOL_MSG_LINKMODES_GET] = ðnl_linkmodes_request_ops, + [ETHTOOL_MSG_LINKSTATE_GET] = ðnl_linkstate_request_ops, + [ETHTOOL_MSG_DEBUG_GET] = ðnl_debug_request_ops, + [ETHTOOL_MSG_WOL_GET] = ðnl_wol_request_ops, +}; + +static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) +{ + return (struct ethnl_dump_ctx *)cb->ctx; +} + +/** + * ethnl_default_parse() - Parse request message + * @req_info: pointer to structure to put data into + * @nlhdr: pointer to request message header + * @net: request netns + * @request_ops: struct request_ops for request type + * @extack: netlink extack for error reporting + * @require_dev: fail if no device identified in header + * + * Parse universal request header and call request specific ->parse_request() + * callback (if defined) to parse the rest of the message. + * + * Return: 0 on success or negative error code + */ +static int ethnl_default_parse(struct ethnl_req_info *req_info, + const struct nlmsghdr *nlhdr, struct net *net, + const struct ethnl_request_ops *request_ops, + struct netlink_ext_ack *extack, bool require_dev) +{ + struct nlattr **tb; + int ret; + + tb = kmalloc_array(request_ops->max_attr + 1, sizeof(tb[0]), + GFP_KERNEL); + if (!tb) + return -ENOMEM; + + ret = nlmsg_parse(nlhdr, GENL_HDRLEN, tb, request_ops->max_attr, + request_ops->request_policy, extack); + if (ret < 0) + goto out; + ret = ethnl_parse_header(req_info, tb[request_ops->hdr_attr], net, + extack, require_dev); + if (ret < 0) + goto out; + + if (request_ops->parse_request) { + ret = request_ops->parse_request(req_info, tb, extack); + if (ret < 0) + goto out; + } + + ret = 0; +out: + kfree(tb); + return ret; +} + +/** + * ethnl_init_reply_data() - Initialize reply data for GET request + * @reply_data: pointer to embedded struct ethnl_reply_data + * @ops: instance of struct ethnl_request_ops describing the layout + * @dev: network device to initialize the reply for + * + * Fills the reply data part with zeros and sets the dev member. Must be called + * before calling the ->fill_reply() callback (for each iteration when handling + * dump requests). + */ +static void ethnl_init_reply_data(struct ethnl_reply_data *reply_data, + const struct ethnl_request_ops *ops, + struct net_device *dev) +{ + memset(reply_data, 0, ops->reply_data_size); + reply_data->dev = dev; +} + +/* default ->doit() handler for GET type requests */ +static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct ethnl_reply_data *reply_data = NULL; + struct ethnl_req_info *req_info = NULL; + const u8 cmd = info->genlhdr->cmd; + const struct ethnl_request_ops *ops; + struct sk_buff *rskb; + void *reply_payload; + int reply_len; + int ret; + + ops = ethnl_default_requests[cmd]; + if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd)) + return -EOPNOTSUPP; + req_info = kzalloc(ops->req_info_size, GFP_KERNEL); + if (!req_info) + return -ENOMEM; + reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); + if (!reply_data) { + kfree(req_info); + return -ENOMEM; + } + + ret = ethnl_default_parse(req_info, info->nlhdr, genl_info_net(info), ops, + info->extack, !ops->allow_nodev_do); + if (ret < 0) + goto err_dev; + ethnl_init_reply_data(reply_data, ops, req_info->dev); + + rtnl_lock(); + ret = ops->prepare_data(req_info, reply_data, info); + rtnl_unlock(); + if (ret < 0) + goto err_cleanup; + ret = ops->reply_size(req_info, reply_data); + if (ret < 0) + goto err_cleanup; + reply_len = ret; + ret = -ENOMEM; + rskb = ethnl_reply_init(reply_len, req_info->dev, ops->reply_cmd, + ops->hdr_attr, info, &reply_payload); + if (!rskb) + goto err_cleanup; + ret = ops->fill_reply(rskb, req_info, reply_data); + if (ret < 0) + goto err_msg; + if (ops->cleanup_data) + ops->cleanup_data(reply_data); + + genlmsg_end(rskb, reply_payload); + if (req_info->dev) + dev_put(req_info->dev); + kfree(reply_data); + kfree(req_info); + return genlmsg_reply(rskb, info); + +err_msg: + WARN_ONCE(ret == -EMSGSIZE, "calculated message payload length (%d) not sufficient\n", reply_len); + nlmsg_free(rskb); +err_cleanup: + if (ops->cleanup_data) + ops->cleanup_data(reply_data); +err_dev: + if (req_info->dev) + dev_put(req_info->dev); + kfree(reply_data); + kfree(req_info); + return ret; +} + +static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, + const struct ethnl_dump_ctx *ctx) +{ + int ret; + + ethnl_init_reply_data(ctx->reply_data, ctx->ops, dev); + rtnl_lock(); + ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, NULL); + rtnl_unlock(); + if (ret < 0) + goto out; + ret = ethnl_fill_reply_header(skb, dev, ctx->ops->hdr_attr); + if (ret < 0) + goto out; + ret = ctx->ops->fill_reply(skb, ctx->req_info, ctx->reply_data); + +out: + if (ctx->ops->cleanup_data) + ctx->ops->cleanup_data(ctx->reply_data); + ctx->reply_data->dev = NULL; + return ret; +} + +/* Default ->dumpit() handler for GET requests. Device iteration copied from + * rtnl_dump_ifinfo(); we have to be more careful about device hashtable + * persistence as we cannot guarantee to hold RTNL lock through the whole + * function as rtnetnlink does. + */ +static int ethnl_default_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); + struct net *net = sock_net(skb->sk); + int s_idx = ctx->pos_idx; + int h, idx = 0; + int ret = 0; + void *ehdr; + + rtnl_lock(); + for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + struct hlist_head *head; + struct net_device *dev; + unsigned int seq; + + head = &net->dev_index_head[h]; + +restart_chain: + seq = net->dev_base_seq; + cb->seq = seq; + idx = 0; + hlist_for_each_entry(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + dev_hold(dev); + rtnl_unlock(); + + ehdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + ðtool_genl_family, 0, + ctx->ops->reply_cmd); + if (!ehdr) { + dev_put(dev); + ret = -EMSGSIZE; + goto out; + } + ret = ethnl_default_dump_one(skb, dev, ctx); + dev_put(dev); + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + if (ret == -EOPNOTSUPP) + goto lock_and_cont; + if (likely(skb->len)) + ret = skb->len; + goto out; + } + genlmsg_end(skb, ehdr); +lock_and_cont: + rtnl_lock(); + if (net->dev_base_seq != seq) { + s_idx = idx + 1; + goto restart_chain; + } +cont: + idx++; + } + + } + rtnl_unlock(); + +out: + ctx->pos_hash = h; + ctx->pos_idx = idx; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + + return ret; +} + +/* generic ->start() handler for GET requests */ +static int ethnl_default_start(struct netlink_callback *cb) +{ + struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); + struct ethnl_reply_data *reply_data; + const struct ethnl_request_ops *ops; + struct ethnl_req_info *req_info; + struct genlmsghdr *ghdr; + int ret; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + + ghdr = nlmsg_data(cb->nlh); + ops = ethnl_default_requests[ghdr->cmd]; + if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd)) + return -EOPNOTSUPP; + req_info = kzalloc(ops->req_info_size, GFP_KERNEL); + if (!req_info) + return -ENOMEM; + reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); + if (!reply_data) { + ret = -ENOMEM; + goto free_req_info; + } + + ret = ethnl_default_parse(req_info, cb->nlh, sock_net(cb->skb->sk), ops, + cb->extack, false); + if (req_info->dev) { + /* We ignore device specification in dump requests but as the + * same parser as for non-dump (doit) requests is used, it + * would take reference to the device if it finds one + */ + dev_put(req_info->dev); + req_info->dev = NULL; + } + if (ret < 0) + goto free_reply_data; + + ctx->ops = ops; + ctx->req_info = req_info; + ctx->reply_data = reply_data; + ctx->pos_hash = 0; + ctx->pos_idx = 0; + + return 0; + +free_reply_data: + kfree(reply_data); +free_req_info: + kfree(req_info); + + return ret; +} + +/* default ->done() handler for GET requests */ +static int ethnl_default_done(struct netlink_callback *cb) +{ + struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); + + kfree(ctx->reply_data); + kfree(ctx->req_info); + + return 0; +} + +static const struct ethnl_request_ops * +ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { + [ETHTOOL_MSG_LINKINFO_NTF] = ðnl_linkinfo_request_ops, + [ETHTOOL_MSG_LINKMODES_NTF] = ðnl_linkmodes_request_ops, + [ETHTOOL_MSG_DEBUG_NTF] = ðnl_debug_request_ops, + [ETHTOOL_MSG_WOL_NTF] = ðnl_wol_request_ops, +}; + +/* default notification handler */ +static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, + const void *data) +{ + struct ethnl_reply_data *reply_data; + const struct ethnl_request_ops *ops; + struct ethnl_req_info *req_info; + struct sk_buff *skb; + void *reply_payload; + int reply_len; + int ret; + + if (WARN_ONCE(cmd > ETHTOOL_MSG_KERNEL_MAX || + !ethnl_default_notify_ops[cmd], + "unexpected notification type %u\n", cmd)) + return; + ops = ethnl_default_notify_ops[cmd]; + req_info = kzalloc(ops->req_info_size, GFP_KERNEL); + if (!req_info) + return; + reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); + if (!reply_data) { + kfree(req_info); + return; + } + + req_info->dev = dev; + req_info->flags |= ETHTOOL_FLAG_COMPACT_BITSETS; + + ethnl_init_reply_data(reply_data, ops, dev); + ret = ops->prepare_data(req_info, reply_data, NULL); + if (ret < 0) + goto err_cleanup; + ret = ops->reply_size(req_info, reply_data); + if (ret < 0) + goto err_cleanup; + reply_len = ret; + ret = -ENOMEM; + skb = genlmsg_new(reply_len, GFP_KERNEL); + if (!skb) + goto err_cleanup; + reply_payload = ethnl_bcastmsg_put(skb, cmd); + if (!reply_payload) + goto err_skb; + ret = ethnl_fill_reply_header(skb, dev, ops->hdr_attr); + if (ret < 0) + goto err_msg; + ret = ops->fill_reply(skb, req_info, reply_data); + if (ret < 0) + goto err_msg; + if (ops->cleanup_data) + ops->cleanup_data(reply_data); + + genlmsg_end(skb, reply_payload); + kfree(reply_data); + kfree(req_info); + ethnl_multicast(skb, dev); + return; + +err_msg: + WARN_ONCE(ret == -EMSGSIZE, + "calculated message payload length (%d) not sufficient\n", + reply_len); +err_skb: + nlmsg_free(skb); +err_cleanup: + if (ops->cleanup_data) + ops->cleanup_data(reply_data); + kfree(reply_data); + kfree(req_info); + return; +} + +/* notifications */ + +typedef void (*ethnl_notify_handler_t)(struct net_device *dev, unsigned int cmd, + const void *data); + +static const ethnl_notify_handler_t ethnl_notify_handlers[] = { + [ETHTOOL_MSG_LINKINFO_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_LINKMODES_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_DEBUG_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_WOL_NTF] = ethnl_default_notify, +}; + +void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) +{ + if (unlikely(!ethnl_ok)) + return; + ASSERT_RTNL(); + + if (likely(cmd < ARRAY_SIZE(ethnl_notify_handlers) && + ethnl_notify_handlers[cmd])) + ethnl_notify_handlers[cmd](dev, cmd, data); + else + WARN_ONCE(1, "notification %u not implemented (dev=%s)\n", + cmd, netdev_name(dev)); +} +EXPORT_SYMBOL(ethtool_notify); + +/* genetlink setup */ + +static const struct genl_ops ethtool_genl_ops[] = { + { + .cmd = ETHTOOL_MSG_STRSET_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, + { + .cmd = ETHTOOL_MSG_LINKINFO_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, + { + .cmd = ETHTOOL_MSG_LINKINFO_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_linkinfo, + }, + { + .cmd = ETHTOOL_MSG_LINKMODES_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, + { + .cmd = ETHTOOL_MSG_LINKMODES_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_linkmodes, + }, + { + .cmd = ETHTOOL_MSG_LINKSTATE_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, + { + .cmd = ETHTOOL_MSG_DEBUG_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, + { + .cmd = ETHTOOL_MSG_DEBUG_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_debug, + }, + { + .cmd = ETHTOOL_MSG_WOL_GET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, + { + .cmd = ETHTOOL_MSG_WOL_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_wol, + }, +}; + +static const struct genl_multicast_group ethtool_nl_mcgrps[] = { + [ETHNL_MCGRP_MONITOR] = { .name = ETHTOOL_MCGRP_MONITOR_NAME }, +}; + +static struct genl_family ethtool_genl_family = { + .name = ETHTOOL_GENL_NAME, + .version = ETHTOOL_GENL_VERSION, + .netnsok = true, + .parallel_ops = true, + .ops = ethtool_genl_ops, + .n_ops = ARRAY_SIZE(ethtool_genl_ops), + .mcgrps = ethtool_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(ethtool_nl_mcgrps), +}; + +/* module setup */ + +static int __init ethnl_init(void) +{ + int ret; + + ret = genl_register_family(ðtool_genl_family); + if (WARN(ret < 0, "ethtool: genetlink family registration failed")) + return ret; + ethnl_ok = true; + + return 0; +} + +subsys_initcall(ethnl_init); diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h new file mode 100644 index 000000000000..60efd87686ad --- /dev/null +++ b/net/ethtool/netlink.h @@ -0,0 +1,345 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _NET_ETHTOOL_NETLINK_H +#define _NET_ETHTOOL_NETLINK_H + +#include <linux/ethtool_netlink.h> +#include <linux/netdevice.h> +#include <net/genetlink.h> +#include <net/sock.h> + +struct ethnl_req_info; + +int ethnl_parse_header(struct ethnl_req_info *req_info, + const struct nlattr *nest, struct net *net, + struct netlink_ext_ack *extack, bool require_dev); +int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, + u16 attrtype); +struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, + u16 hdr_attrtype, struct genl_info *info, + void **ehdrp); + +/** + * ethnl_strz_size() - calculate attribute length for fixed size string + * @s: ETH_GSTRING_LEN sized string (may not be null terminated) + * + * Return: total length of an attribute with null terminated string from @s + */ +static inline int ethnl_strz_size(const char *s) +{ + return nla_total_size(strnlen(s, ETH_GSTRING_LEN) + 1); +} + +/** + * ethnl_put_strz() - put string attribute with fixed size string + * @skb: skb with the message + * @attrype: attribute type + * @s: ETH_GSTRING_LEN sized string (may not be null terminated) + * + * Puts an attribute with null terminated string from @s into the message. + * + * Return: 0 on success, negative error code on failure + */ +static inline int ethnl_put_strz(struct sk_buff *skb, u16 attrtype, + const char *s) +{ + unsigned int len = strnlen(s, ETH_GSTRING_LEN); + struct nlattr *attr; + + attr = nla_reserve(skb, attrtype, len + 1); + if (!attr) + return -EMSGSIZE; + + memcpy(nla_data(attr), s, len); + ((char *)nla_data(attr))[len] = '\0'; + return 0; +} + +/** + * ethnl_update_u32() - update u32 value from NLA_U32 attribute + * @dst: value to update + * @attr: netlink attribute with new value or null + * @mod: pointer to bool for modification tracking + * + * Copy the u32 value from NLA_U32 netlink attribute @attr into variable + * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod + * is set to true if this function changed the value of *dst, otherwise it + * is left as is. + */ +static inline void ethnl_update_u32(u32 *dst, const struct nlattr *attr, + bool *mod) +{ + u32 val; + + if (!attr) + return; + val = nla_get_u32(attr); + if (*dst == val) + return; + + *dst = val; + *mod = true; +} + +/** + * ethnl_update_u8() - update u8 value from NLA_U8 attribute + * @dst: value to update + * @attr: netlink attribute with new value or null + * @mod: pointer to bool for modification tracking + * + * Copy the u8 value from NLA_U8 netlink attribute @attr into variable + * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod + * is set to true if this function changed the value of *dst, otherwise it + * is left as is. + */ +static inline void ethnl_update_u8(u8 *dst, const struct nlattr *attr, + bool *mod) +{ + u8 val; + + if (!attr) + return; + val = nla_get_u8(attr); + if (*dst == val) + return; + + *dst = val; + *mod = true; +} + +/** + * ethnl_update_bool32() - update u32 used as bool from NLA_U8 attribute + * @dst: value to update + * @attr: netlink attribute with new value or null + * @mod: pointer to bool for modification tracking + * + * Use the u8 value from NLA_U8 netlink attribute @attr to set u32 variable + * pointed to by @dst to 0 (if zero) or 1 (if not); do nothing if @attr is + * null. Bool pointed to by @mod is set to true if this function changed the + * logical value of *dst, otherwise it is left as is. + */ +static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr, + bool *mod) +{ + u8 val; + + if (!attr) + return; + val = !!nla_get_u8(attr); + if (!!*dst == val) + return; + + *dst = val; + *mod = true; +} + +/** + * ethnl_update_binary() - update binary data from NLA_BINARY atribute + * @dst: value to update + * @len: destination buffer length + * @attr: netlink attribute with new value or null + * @mod: pointer to bool for modification tracking + * + * Use the u8 value from NLA_U8 netlink attribute @attr to rewrite data block + * of length @len at @dst by attribute payload; do nothing if @attr is null. + * Bool pointed to by @mod is set to true if this function changed the logical + * value of *dst, otherwise it is left as is. + */ +static inline void ethnl_update_binary(void *dst, unsigned int len, + const struct nlattr *attr, bool *mod) +{ + if (!attr) + return; + if (nla_len(attr) < len) + len = nla_len(attr); + if (!memcmp(dst, nla_data(attr), len)) + return; + + memcpy(dst, nla_data(attr), len); + *mod = true; +} + +/** + * ethnl_update_bitfield32() - update u32 value from NLA_BITFIELD32 attribute + * @dst: value to update + * @attr: netlink attribute with new value or null + * @mod: pointer to bool for modification tracking + * + * Update bits in u32 value which are set in attribute's mask to values from + * attribute's value. Do nothing if @attr is null or the value wouldn't change; + * otherwise, set bool pointed to by @mod to true. + */ +static inline void ethnl_update_bitfield32(u32 *dst, const struct nlattr *attr, + bool *mod) +{ + struct nla_bitfield32 change; + u32 newval; + + if (!attr) + return; + change = nla_get_bitfield32(attr); + newval = (*dst & ~change.selector) | (change.value & change.selector); + if (*dst == newval) + return; + + *dst = newval; + *mod = true; +} + +/** + * ethnl_reply_header_size() - total size of reply header + * + * This is an upper estimate so that we do not need to hold RTNL lock longer + * than necessary (to prevent rename between size estimate and composing the + * message). Accounts only for device ifindex and name as those are the only + * attributes ethnl_fill_reply_header() puts into the reply header. + */ +static inline unsigned int ethnl_reply_header_size(void) +{ + return nla_total_size(nla_total_size(sizeof(u32)) + + nla_total_size(IFNAMSIZ)); +} + +/* GET request handling */ + +/* Unified processing of GET requests uses two data structures: request info + * and reply data. Request info holds information parsed from client request + * and its stays constant through all request processing. Reply data holds data + * retrieved from ethtool_ops callbacks or other internal sources which is used + * to compose the reply. When processing a dump request, request info is filled + * only once (when the request message is parsed) but reply data is filled for + * each reply message. + * + * Both structures consist of part common for all request types (struct + * ethnl_req_info and struct ethnl_reply_data defined below) and optional + * parts specific for each request type. Common part always starts at offset 0. + */ + +/** + * struct ethnl_req_info - base type of request information for GET requests + * @dev: network device the request is for (may be null) + * @flags: request flags common for all request types + * + * This is a common base for request specific structures holding data from + * parsed userspace request. These always embed struct ethnl_req_info at + * zero offset. + */ +struct ethnl_req_info { + struct net_device *dev; + u32 flags; +}; + +/** + * struct ethnl_reply_data - base type of reply data for GET requests + * @dev: device for current reply message; in single shot requests it is + * equal to ðnl_req_info.dev; in dumps it's different for each + * reply message + * + * This is a common base for request specific structures holding data for + * kernel reply message. These always embed struct ethnl_reply_data at zero + * offset. + */ +struct ethnl_reply_data { + struct net_device *dev; +}; + +static inline int ethnl_ops_begin(struct net_device *dev) +{ + if (dev && dev->ethtool_ops->begin) + return dev->ethtool_ops->begin(dev); + else + return 0; +} + +static inline void ethnl_ops_complete(struct net_device *dev) +{ + if (dev && dev->ethtool_ops->complete) + dev->ethtool_ops->complete(dev); +} + +/** + * struct ethnl_request_ops - unified handling of GET requests + * @request_cmd: command id for request (GET) + * @reply_cmd: command id for reply (GET_REPLY) + * @hdr_attr: attribute type for request header + * @max_attr: maximum (top level) attribute type + * @req_info_size: size of request info + * @reply_data_size: size of reply data + * @request_policy: netlink policy for message contents + * @allow_nodev_do: allow non-dump request with no device identification + * @parse_request: + * Parse request except common header (struct ethnl_req_info). Common + * header is already filled on entry, the rest up to @repdata_offset + * is zero initialized. This callback should only modify type specific + * request info by parsed attributes from request message. + * @prepare_data: + * Retrieve and prepare data needed to compose a reply message. Calls to + * ethtool_ops handlers are limited to this callback. Common reply data + * (struct ethnl_reply_data) is filled on entry, type specific part after + * it is zero initialized. This callback should only modify the type + * specific part of reply data. Device identification from struct + * ethnl_reply_data is to be used as for dump requests, it iterates + * through network devices while dev member of struct ethnl_req_info + * points to the device from client request. + * @reply_size: + * Estimate reply message size. Returned value must be sufficient for + * message payload without common reply header. The callback may returned + * estimate higher than actual message size if exact calculation would + * not be worth the saved memory space. + * @fill_reply: + * Fill reply message payload (except for common header) from reply data. + * The callback must not generate more payload than previously called + * ->reply_size() estimated. + * @cleanup_data: + * Optional cleanup called when reply data is no longer needed. Can be + * used e.g. to free any additional data structures outside the main + * structure which were allocated by ->prepare_data(). When processing + * dump requests, ->cleanup() is called for each message. + * + * Description of variable parts of GET request handling when using the + * unified infrastructure. When used, a pointer to an instance of this + * structure is to be added to ðnl_default_requests array and generic + * handlers ethnl_default_doit(), ethnl_default_dumpit(), + * ethnl_default_start() and ethnl_default_done() used in @ethtool_genl_ops; + * ethnl_default_notify() can be used in @ethnl_notify_handlers to send + * notifications of the corresponding type. + */ +struct ethnl_request_ops { + u8 request_cmd; + u8 reply_cmd; + u16 hdr_attr; + unsigned int max_attr; + unsigned int req_info_size; + unsigned int reply_data_size; + const struct nla_policy *request_policy; + bool allow_nodev_do; + + int (*parse_request)(struct ethnl_req_info *req_info, + struct nlattr **tb, + struct netlink_ext_ack *extack); + int (*prepare_data)(const struct ethnl_req_info *req_info, + struct ethnl_reply_data *reply_data, + struct genl_info *info); + int (*reply_size)(const struct ethnl_req_info *req_info, + const struct ethnl_reply_data *reply_data); + int (*fill_reply)(struct sk_buff *skb, + const struct ethnl_req_info *req_info, + const struct ethnl_reply_data *reply_data); + void (*cleanup_data)(struct ethnl_reply_data *reply_data); +}; + +/* request handlers */ + +extern const struct ethnl_request_ops ethnl_strset_request_ops; +extern const struct ethnl_request_ops ethnl_linkinfo_request_ops; +extern const struct ethnl_request_ops ethnl_linkmodes_request_ops; +extern const struct ethnl_request_ops ethnl_linkstate_request_ops; +extern const struct ethnl_request_ops ethnl_debug_request_ops; +extern const struct ethnl_request_ops ethnl_wol_request_ops; + +int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info); + +#endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c new file mode 100644 index 000000000000..8e5911887b4c --- /dev/null +++ b/net/ethtool/strset.c @@ -0,0 +1,437 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/ethtool.h> +#include <linux/phy.h> +#include "netlink.h" +#include "common.h" + +struct strset_info { + bool per_dev; + bool free_strings; + unsigned int count; + const char (*strings)[ETH_GSTRING_LEN]; +}; + +static const struct strset_info info_template[] = { + [ETH_SS_TEST] = { + .per_dev = true, + }, + [ETH_SS_STATS] = { + .per_dev = true, + }, + [ETH_SS_PRIV_FLAGS] = { + .per_dev = true, + }, + [ETH_SS_FEATURES] = { + .per_dev = false, + .count = ARRAY_SIZE(netdev_features_strings), + .strings = netdev_features_strings, + }, + [ETH_SS_RSS_HASH_FUNCS] = { + .per_dev = false, + .count = ARRAY_SIZE(rss_hash_func_strings), + .strings = rss_hash_func_strings, + }, + [ETH_SS_TUNABLES] = { + .per_dev = false, + .count = ARRAY_SIZE(tunable_strings), + .strings = tunable_strings, + }, + [ETH_SS_PHY_STATS] = { + .per_dev = true, + }, + [ETH_SS_PHY_TUNABLES] = { + .per_dev = false, + .count = ARRAY_SIZE(phy_tunable_strings), + .strings = phy_tunable_strings, + }, + [ETH_SS_LINK_MODES] = { + .per_dev = false, + .count = __ETHTOOL_LINK_MODE_MASK_NBITS, + .strings = link_mode_names, + }, + [ETH_SS_MSG_CLASSES] = { + .per_dev = false, + .count = NETIF_MSG_CLASS_COUNT, + .strings = netif_msg_class_names, + }, + [ETH_SS_WOL_MODES] = { + .per_dev = false, + .count = WOL_MODE_COUNT, + .strings = wol_mode_names, + }, +}; + +struct strset_req_info { + struct ethnl_req_info base; + u32 req_ids; + bool counts_only; +}; + +#define STRSET_REQINFO(__req_base) \ + container_of(__req_base, struct strset_req_info, base) + +struct strset_reply_data { + struct ethnl_reply_data base; + struct strset_info sets[ETH_SS_COUNT]; +}; + +#define STRSET_REPDATA(__reply_base) \ + container_of(__reply_base, struct strset_reply_data, base) + +static const struct nla_policy strset_get_policy[ETHTOOL_A_STRSET_MAX + 1] = { + [ETHTOOL_A_STRSET_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_STRSET_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_STRSET_STRINGSETS] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +get_stringset_policy[ETHTOOL_A_STRINGSET_MAX + 1] = { + [ETHTOOL_A_STRINGSET_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_STRINGSET_ID] = { .type = NLA_U32 }, + [ETHTOOL_A_STRINGSET_COUNT] = { .type = NLA_REJECT }, + [ETHTOOL_A_STRINGSET_STRINGS] = { .type = NLA_REJECT }, +}; + +/** + * strset_include() - test if a string set should be included in reply + * @info: parsed client request + * @data: pointer to request data structure + * @id: id of string set to check (ETH_SS_* constants) + */ +static bool strset_include(const struct strset_req_info *info, + const struct strset_reply_data *data, u32 id) +{ + bool per_dev; + + BUILD_BUG_ON(ETH_SS_COUNT >= BITS_PER_BYTE * sizeof(info->req_ids)); + + if (info->req_ids) + return info->req_ids & (1U << id); + per_dev = data->sets[id].per_dev; + if (!per_dev && !data->sets[id].strings) + return false; + + return data->base.dev ? per_dev : !per_dev; +} + +static int strset_get_id(const struct nlattr *nest, u32 *val, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[ETHTOOL_A_STRINGSET_MAX + 1]; + int ret; + + ret = nla_parse_nested(tb, ETHTOOL_A_STRINGSET_MAX, nest, + get_stringset_policy, extack); + if (ret < 0) + return ret; + if (!tb[ETHTOOL_A_STRINGSET_ID]) + return -EINVAL; + + *val = nla_get_u32(tb[ETHTOOL_A_STRINGSET_ID]); + return 0; +} + +static const struct nla_policy +strset_stringsets_policy[ETHTOOL_A_STRINGSETS_MAX + 1] = { + [ETHTOOL_A_STRINGSETS_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_STRINGSETS_STRINGSET] = { .type = NLA_NESTED }, +}; + +static int strset_parse_request(struct ethnl_req_info *req_base, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct strset_req_info *req_info = STRSET_REQINFO(req_base); + struct nlattr *nest = tb[ETHTOOL_A_STRSET_STRINGSETS]; + struct nlattr *attr; + int rem, ret; + + if (!nest) + return 0; + ret = nla_validate_nested(nest, ETHTOOL_A_STRINGSETS_MAX, + strset_stringsets_policy, extack); + if (ret < 0) + return ret; + + req_info->counts_only = tb[ETHTOOL_A_STRSET_COUNTS_ONLY]; + nla_for_each_nested(attr, nest, rem) { + u32 id; + + if (WARN_ONCE(nla_type(attr) != ETHTOOL_A_STRINGSETS_STRINGSET, + "unexpected attrtype %u in ETHTOOL_A_STRSET_STRINGSETS\n", + nla_type(attr))) + return -EINVAL; + + ret = strset_get_id(attr, &id, extack); + if (ret < 0) + return ret; + if (ret >= ETH_SS_COUNT) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "unknown string set id"); + return -EOPNOTSUPP; + } + + req_info->req_ids |= (1U << id); + } + + return 0; +} + +static void strset_cleanup_data(struct ethnl_reply_data *reply_base) +{ + struct strset_reply_data *data = STRSET_REPDATA(reply_base); + unsigned int i; + + for (i = 0; i < ETH_SS_COUNT; i++) + if (data->sets[i].free_strings) { + kfree(data->sets[i].strings); + data->sets[i].strings = NULL; + data->sets[i].free_strings = false; + } +} + +static int strset_prepare_set(struct strset_info *info, struct net_device *dev, + unsigned int id, bool counts_only) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + void *strings; + int count, ret; + + if (id == ETH_SS_PHY_STATS && dev->phydev && + !ops->get_ethtool_phy_stats) + ret = phy_ethtool_get_sset_count(dev->phydev); + else if (ops->get_sset_count && ops->get_strings) + ret = ops->get_sset_count(dev, id); + else + ret = -EOPNOTSUPP; + if (ret <= 0) { + info->count = 0; + return 0; + } + + count = ret; + if (!counts_only) { + strings = kcalloc(count, ETH_GSTRING_LEN, GFP_KERNEL); + if (!strings) + return -ENOMEM; + if (id == ETH_SS_PHY_STATS && dev->phydev && + !ops->get_ethtool_phy_stats) + phy_ethtool_get_strings(dev->phydev, strings); + else + ops->get_strings(dev, id, strings); + info->strings = strings; + info->free_strings = true; + } + info->count = count; + + return 0; +} + +static int strset_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + const struct strset_req_info *req_info = STRSET_REQINFO(req_base); + struct strset_reply_data *data = STRSET_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + unsigned int i; + int ret; + + BUILD_BUG_ON(ARRAY_SIZE(info_template) != ETH_SS_COUNT); + memcpy(&data->sets, &info_template, sizeof(data->sets)); + + if (!dev) { + for (i = 0; i < ETH_SS_COUNT; i++) { + if ((req_info->req_ids & (1U << i)) && + data->sets[i].per_dev) { + if (info) + GENL_SET_ERR_MSG(info, "requested per device strings without dev"); + return -EINVAL; + } + } + return 0; + } + + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto err_strset; + for (i = 0; i < ETH_SS_COUNT; i++) { + if (!strset_include(req_info, data, i) || + !data->sets[i].per_dev) + continue; + + ret = strset_prepare_set(&data->sets[i], dev, i, + req_info->counts_only); + if (ret < 0) + goto err_ops; + } + ethnl_ops_complete(dev); + + return 0; +err_ops: + ethnl_ops_complete(dev); +err_strset: + strset_cleanup_data(reply_base); + return ret; +} + +/* calculate size of ETHTOOL_A_STRSET_STRINGSET nest for one string set */ +static int strset_set_size(const struct strset_info *info, bool counts_only) +{ + unsigned int len = 0; + unsigned int i; + + if (info->count == 0) + return 0; + if (counts_only) + return nla_total_size(2 * nla_total_size(sizeof(u32))); + + for (i = 0; i < info->count; i++) { + const char *str = info->strings[i]; + + /* ETHTOOL_A_STRING_INDEX, ETHTOOL_A_STRING_VALUE, nest */ + len += nla_total_size(nla_total_size(sizeof(u32)) + + ethnl_strz_size(str)); + } + /* ETHTOOL_A_STRINGSET_ID, ETHTOOL_A_STRINGSET_COUNT */ + len = 2 * nla_total_size(sizeof(u32)) + nla_total_size(len); + + return nla_total_size(len); +} + +static int strset_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct strset_req_info *req_info = STRSET_REQINFO(req_base); + const struct strset_reply_data *data = STRSET_REPDATA(reply_base); + unsigned int i; + int len = 0; + int ret; + + len += ethnl_reply_header_size(); + for (i = 0; i < ETH_SS_COUNT; i++) { + const struct strset_info *set_info = &data->sets[i]; + + if (!strset_include(req_info, data, i)) + continue; + + ret = strset_set_size(set_info, req_info->counts_only); + if (ret < 0) + return ret; + len += ret; + } + + return len; +} + +/* fill one string into reply */ +static int strset_fill_string(struct sk_buff *skb, + const struct strset_info *set_info, u32 idx) +{ + struct nlattr *string_attr; + const char *value; + + value = set_info->strings[idx]; + + string_attr = nla_nest_start(skb, ETHTOOL_A_STRINGS_STRING); + if (!string_attr) + return -EMSGSIZE; + if (nla_put_u32(skb, ETHTOOL_A_STRING_INDEX, idx) || + ethnl_put_strz(skb, ETHTOOL_A_STRING_VALUE, value)) + goto nla_put_failure; + nla_nest_end(skb, string_attr); + + return 0; +nla_put_failure: + nla_nest_cancel(skb, string_attr); + return -EMSGSIZE; +} + +/* fill one string set into reply */ +static int strset_fill_set(struct sk_buff *skb, + const struct strset_info *set_info, u32 id, + bool counts_only) +{ + struct nlattr *stringset_attr; + struct nlattr *strings_attr; + unsigned int i; + + if (!set_info->per_dev && !set_info->strings) + return -EOPNOTSUPP; + if (set_info->count == 0) + return 0; + stringset_attr = nla_nest_start(skb, ETHTOOL_A_STRINGSETS_STRINGSET); + if (!stringset_attr) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_STRINGSET_ID, id) || + nla_put_u32(skb, ETHTOOL_A_STRINGSET_COUNT, set_info->count)) + goto nla_put_failure; + + if (!counts_only) { + strings_attr = nla_nest_start(skb, ETHTOOL_A_STRINGSET_STRINGS); + if (!strings_attr) + goto nla_put_failure; + for (i = 0; i < set_info->count; i++) { + if (strset_fill_string(skb, set_info, i) < 0) + goto nla_put_failure; + } + nla_nest_end(skb, strings_attr); + } + + nla_nest_end(skb, stringset_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, stringset_attr); + return -EMSGSIZE; +} + +static int strset_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct strset_req_info *req_info = STRSET_REQINFO(req_base); + const struct strset_reply_data *data = STRSET_REPDATA(reply_base); + struct nlattr *nest; + unsigned int i; + int ret; + + nest = nla_nest_start(skb, ETHTOOL_A_STRSET_STRINGSETS); + if (!nest) + return -EMSGSIZE; + + for (i = 0; i < ETH_SS_COUNT; i++) { + if (strset_include(req_info, data, i)) { + ret = strset_fill_set(skb, &data->sets[i], i, + req_info->counts_only); + if (ret < 0) + goto nla_put_failure; + } + } + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return ret; +} + +const struct ethnl_request_ops ethnl_strset_request_ops = { + .request_cmd = ETHTOOL_MSG_STRSET_GET, + .reply_cmd = ETHTOOL_MSG_STRSET_GET_REPLY, + .hdr_attr = ETHTOOL_A_STRSET_HEADER, + .max_attr = ETHTOOL_A_STRSET_MAX, + .req_info_size = sizeof(struct strset_req_info), + .reply_data_size = sizeof(struct strset_reply_data), + .request_policy = strset_get_policy, + .allow_nodev_do = true, + + .parse_request = strset_parse_request, + .prepare_data = strset_prepare_data, + .reply_size = strset_reply_size, + .fill_reply = strset_fill_reply, + .cleanup_data = strset_cleanup_data, +}; diff --git a/net/ethtool/wol.c b/net/ethtool/wol.c new file mode 100644 index 000000000000..e1b8a65b64c4 --- /dev/null +++ b/net/ethtool/wol.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct wol_req_info { + struct ethnl_req_info base; +}; + +struct wol_reply_data { + struct ethnl_reply_data base; + struct ethtool_wolinfo wol; + bool show_sopass; +}; + +#define WOL_REPDATA(__reply_base) \ + container_of(__reply_base, struct wol_reply_data, base) + +static const struct nla_policy +wol_get_policy[ETHTOOL_A_WOL_MAX + 1] = { + [ETHTOOL_A_WOL_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_WOL_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_WOL_MODES] = { .type = NLA_REJECT }, + [ETHTOOL_A_WOL_SOPASS] = { .type = NLA_REJECT }, +}; + +static int wol_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct wol_reply_data *data = WOL_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + if (!dev->ethtool_ops->get_wol) + return -EOPNOTSUPP; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + dev->ethtool_ops->get_wol(dev, &data->wol); + ethnl_ops_complete(dev); + /* do not include password in notifications */ + data->show_sopass = info && (data->wol.supported & WAKE_MAGICSECURE); + + return 0; +} + +static int wol_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct wol_reply_data *data = WOL_REPDATA(reply_base); + int len; + + len = ethnl_bitset32_size(&data->wol.wolopts, &data->wol.supported, + WOL_MODE_COUNT, wol_mode_names, compact); + if (len < 0) + return len; + if (data->show_sopass) + len += nla_total_size(sizeof(data->wol.sopass)); + + return len; +} + +static int wol_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct wol_reply_data *data = WOL_REPDATA(reply_base); + int ret; + + ret = ethnl_put_bitset32(skb, ETHTOOL_A_WOL_MODES, &data->wol.wolopts, + &data->wol.supported, WOL_MODE_COUNT, + wol_mode_names, compact); + if (ret < 0) + return ret; + if (data->show_sopass && + nla_put(skb, ETHTOOL_A_WOL_SOPASS, sizeof(data->wol.sopass), + data->wol.sopass)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_wol_request_ops = { + .request_cmd = ETHTOOL_MSG_WOL_GET, + .reply_cmd = ETHTOOL_MSG_WOL_GET_REPLY, + .hdr_attr = ETHTOOL_A_WOL_HEADER, + .max_attr = ETHTOOL_A_WOL_MAX, + .req_info_size = sizeof(struct wol_req_info), + .reply_data_size = sizeof(struct wol_reply_data), + .request_policy = wol_get_policy, + + .prepare_data = wol_prepare_data, + .reply_size = wol_reply_size, + .fill_reply = wol_fill_reply, +}; + +/* WOL_SET */ + +static const struct nla_policy +wol_set_policy[ETHTOOL_A_WOL_MAX + 1] = { + [ETHTOOL_A_WOL_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_WOL_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_WOL_MODES] = { .type = NLA_NESTED }, + [ETHTOOL_A_WOL_SOPASS] = { .type = NLA_BINARY, + .len = SOPASS_MAX }, +}; + +int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info) +{ + struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; + struct nlattr *tb[ETHTOOL_A_WOL_MAX + 1]; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + bool mod = false; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_WOL_MAX, + wol_set_policy, info->extack); + if (ret < 0) + return ret; + ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_WOL_HEADER], + genl_info_net(info), info->extack, true); + if (ret < 0) + return ret; + dev = req_info.dev; + if (!dev->ethtool_ops->get_wol || !dev->ethtool_ops->set_wol) + return -EOPNOTSUPP; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + dev->ethtool_ops->get_wol(dev, &wol); + ret = ethnl_update_bitset32(&wol.wolopts, WOL_MODE_COUNT, + tb[ETHTOOL_A_WOL_MODES], wol_mode_names, + info->extack, &mod); + if (ret < 0) + goto out_ops; + if (wol.wolopts & ~wol.supported) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_WOL_MODES], + "cannot enable unsupported WoL mode"); + ret = -EINVAL; + goto out_ops; + } + if (tb[ETHTOOL_A_WOL_SOPASS]) { + if (!(wol.supported & WAKE_MAGICSECURE)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_WOL_SOPASS], + "magicsecure not supported, cannot set password"); + ret = -EINVAL; + goto out_ops; + } + ethnl_update_binary(wol.sopass, sizeof(wol.sopass), + tb[ETHTOOL_A_WOL_SOPASS], &mod); + } + + if (!mod) + goto out_ops; + ret = dev->ethtool_ops->set_wol(dev, &wol); + if (ret) + goto out_ops; + dev->wol_enabled = !!wol.wolopts; + ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF, NULL); + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); + dev_put(dev); + return ret; +} diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index 94447974a3c0..d5f709b940ff 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -20,6 +20,8 @@ #include "hsr_main.h" #include "hsr_framereg.h" +static struct dentry *hsr_debugfs_root_dir; + static void print_mac_address(struct seq_file *sfp, unsigned char *mac) { seq_printf(sfp, "%02x:%02x:%02x:%02x:%02x:%02x:", @@ -63,8 +65,20 @@ hsr_node_table_open(struct inode *inode, struct file *filp) return single_open(filp, hsr_node_table_show, inode->i_private); } +void hsr_debugfs_rename(struct net_device *dev) +{ + struct hsr_priv *priv = netdev_priv(dev); + struct dentry *d; + + d = debugfs_rename(hsr_debugfs_root_dir, priv->node_tbl_root, + hsr_debugfs_root_dir, dev->name); + if (IS_ERR(d)) + netdev_warn(dev, "failed to rename\n"); + else + priv->node_tbl_root = d; +} + static const struct file_operations hsr_fops = { - .owner = THIS_MODULE, .open = hsr_node_table_open, .read = seq_read, .llseek = seq_lseek, @@ -78,15 +92,14 @@ static const struct file_operations hsr_fops = { * When debugfs is configured this routine sets up the node_table file per * hsr device for dumping the node_table entries */ -int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) +void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) { - int rc = -1; struct dentry *de = NULL; - de = debugfs_create_dir(hsr_dev->name, NULL); - if (!de) { - pr_err("Cannot create hsr debugfs root\n"); - return rc; + de = debugfs_create_dir(hsr_dev->name, hsr_debugfs_root_dir); + if (IS_ERR(de)) { + pr_err("Cannot create hsr debugfs directory\n"); + return; } priv->node_tbl_root = de; @@ -94,13 +107,13 @@ int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) de = debugfs_create_file("node_table", S_IFREG | 0444, priv->node_tbl_root, priv, &hsr_fops); - if (!de) { - pr_err("Cannot create hsr node_table directory\n"); - return rc; + if (IS_ERR(de)) { + pr_err("Cannot create hsr node_table file\n"); + debugfs_remove(priv->node_tbl_root); + priv->node_tbl_root = NULL; + return; } priv->node_tbl_file = de; - - return 0; } /* hsr_debugfs_term - Tear down debugfs intrastructure @@ -117,3 +130,18 @@ hsr_debugfs_term(struct hsr_priv *priv) debugfs_remove(priv->node_tbl_root); priv->node_tbl_root = NULL; } + +void hsr_debugfs_create_root(void) +{ + hsr_debugfs_root_dir = debugfs_create_dir("hsr", NULL); + if (IS_ERR(hsr_debugfs_root_dir)) { + pr_err("Cannot create hsr debugfs root directory\n"); + hsr_debugfs_root_dir = NULL; + } +} + +void hsr_debugfs_remove_root(void) +{ + /* debugfs_remove() internally checks NULL and ERROR */ + debugfs_remove(hsr_debugfs_root_dir); +} diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index f509b495451a..c7bd6c49fadf 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -227,8 +227,13 @@ static int hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct hsr_port *master; master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); - skb->dev = master->dev; - hsr_forward_skb(skb, master); + if (master) { + skb->dev = master->dev; + hsr_forward_skb(skb, master); + } else { + atomic_long_inc(&dev->tx_dropped); + dev_kfree_skb_any(skb); + } return NETDEV_TX_OK; } @@ -267,6 +272,8 @@ static void send_hsr_supervision_frame(struct hsr_port *master, skb->dev->dev_addr, skb->len) <= 0) goto out; skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); if (hsr_ver > 0) { hsr_tag = skb_put(skb, sizeof(struct hsr_tag)); @@ -363,7 +370,7 @@ static void hsr_dev_destroy(struct net_device *hsr_dev) del_timer_sync(&hsr->prune_timer); del_timer_sync(&hsr->announce_timer); - hsr_del_self_node(&hsr->self_node_db); + hsr_del_self_node(hsr); hsr_del_nodes(&hsr->node_db); } @@ -435,11 +442,12 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], INIT_LIST_HEAD(&hsr->ports); INIT_LIST_HEAD(&hsr->node_db); INIT_LIST_HEAD(&hsr->self_node_db); + spin_lock_init(&hsr->list_lock); ether_addr_copy(hsr_dev->dev_addr, slave[0]->dev_addr); /* Make sure we recognize frames from ourselves in hsr_rcv() */ - res = hsr_create_self_node(&hsr->self_node_db, hsr_dev->dev_addr, + res = hsr_create_self_node(hsr, hsr_dev->dev_addr, slave[1]->dev_addr); if (res < 0) return res; @@ -472,31 +480,32 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], res = hsr_add_port(hsr, hsr_dev, HSR_PT_MASTER); if (res) - goto err_add_port; + goto err_add_master; res = register_netdevice(hsr_dev); if (res) - goto fail; + goto err_unregister; res = hsr_add_port(hsr, slave[0], HSR_PT_SLAVE_A); if (res) - goto fail; + goto err_add_slaves; + res = hsr_add_port(hsr, slave[1], HSR_PT_SLAVE_B); if (res) - goto fail; + goto err_add_slaves; + hsr_debugfs_init(hsr, hsr_dev); mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD)); - res = hsr_debugfs_init(hsr, hsr_dev); - if (res) - goto fail; return 0; -fail: +err_add_slaves: + unregister_netdevice(hsr_dev); +err_unregister: list_for_each_entry_safe(port, tmp, &hsr->ports, port_list) hsr_del_port(port); -err_add_port: - hsr_del_self_node(&hsr->self_node_db); +err_add_master: + hsr_del_self_node(hsr); return res; } diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 292be446007b..364ea2cc028e 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -35,7 +35,6 @@ static bool seq_nr_after(u16 a, u16 b) } #define seq_nr_before(a, b) seq_nr_after((b), (a)) -#define seq_nr_after_or_eq(a, b) (!seq_nr_before((a), (b))) #define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b))) bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) @@ -75,10 +74,11 @@ static struct hsr_node *find_node_by_addr_A(struct list_head *node_db, /* Helper for device init; the self_node_db is used in hsr_rcv() to recognize * frames from self that's been looped over the HSR ring. */ -int hsr_create_self_node(struct list_head *self_node_db, +int hsr_create_self_node(struct hsr_priv *hsr, unsigned char addr_a[ETH_ALEN], unsigned char addr_b[ETH_ALEN]) { + struct list_head *self_node_db = &hsr->self_node_db; struct hsr_node *node, *oldnode; node = kmalloc(sizeof(*node), GFP_KERNEL); @@ -88,33 +88,33 @@ int hsr_create_self_node(struct list_head *self_node_db, ether_addr_copy(node->macaddress_A, addr_a); ether_addr_copy(node->macaddress_B, addr_b); - rcu_read_lock(); + spin_lock_bh(&hsr->list_lock); oldnode = list_first_or_null_rcu(self_node_db, struct hsr_node, mac_list); if (oldnode) { list_replace_rcu(&oldnode->mac_list, &node->mac_list); - rcu_read_unlock(); - synchronize_rcu(); - kfree(oldnode); + spin_unlock_bh(&hsr->list_lock); + kfree_rcu(oldnode, rcu_head); } else { - rcu_read_unlock(); list_add_tail_rcu(&node->mac_list, self_node_db); + spin_unlock_bh(&hsr->list_lock); } return 0; } -void hsr_del_self_node(struct list_head *self_node_db) +void hsr_del_self_node(struct hsr_priv *hsr) { + struct list_head *self_node_db = &hsr->self_node_db; struct hsr_node *node; - rcu_read_lock(); + spin_lock_bh(&hsr->list_lock); node = list_first_or_null_rcu(self_node_db, struct hsr_node, mac_list); - rcu_read_unlock(); if (node) { list_del_rcu(&node->mac_list); - kfree(node); + kfree_rcu(node, rcu_head); } + spin_unlock_bh(&hsr->list_lock); } void hsr_del_nodes(struct list_head *node_db) @@ -130,30 +130,43 @@ void hsr_del_nodes(struct list_head *node_db) * seq_out is used to initialize filtering of outgoing duplicate frames * originating from the newly added node. */ -struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[], - u16 seq_out) +static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, + struct list_head *node_db, + unsigned char addr[], + u16 seq_out) { - struct hsr_node *node; + struct hsr_node *new_node, *node; unsigned long now; int i; - node = kzalloc(sizeof(*node), GFP_ATOMIC); - if (!node) + new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC); + if (!new_node) return NULL; - ether_addr_copy(node->macaddress_A, addr); + ether_addr_copy(new_node->macaddress_A, addr); /* We are only interested in time diffs here, so use current jiffies * as initialization. (0 could trigger an spurious ring error warning). */ now = jiffies; for (i = 0; i < HSR_PT_PORTS; i++) - node->time_in[i] = now; + new_node->time_in[i] = now; for (i = 0; i < HSR_PT_PORTS; i++) - node->seq_out[i] = seq_out; - - list_add_tail_rcu(&node->mac_list, node_db); + new_node->seq_out[i] = seq_out; + spin_lock_bh(&hsr->list_lock); + list_for_each_entry_rcu(node, node_db, mac_list) { + if (ether_addr_equal(node->macaddress_A, addr)) + goto out; + if (ether_addr_equal(node->macaddress_B, addr)) + goto out; + } + list_add_tail_rcu(&new_node->mac_list, node_db); + spin_unlock_bh(&hsr->list_lock); + return new_node; +out: + spin_unlock_bh(&hsr->list_lock); + kfree(new_node); return node; } @@ -163,6 +176,7 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, bool is_sup) { struct list_head *node_db = &port->hsr->node_db; + struct hsr_priv *hsr = port->hsr; struct hsr_node *node; struct ethhdr *ethhdr; u16 seq_out; @@ -196,7 +210,7 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, seq_out = HSR_SEQNR_START; } - return hsr_add_node(node_db, ethhdr->h_source, seq_out); + return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out); } /* Use the Supervision frame's info about an eventual macaddress_B for merging @@ -206,10 +220,11 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, struct hsr_port *port_rcv) { - struct ethhdr *ethhdr; - struct hsr_node *node_real; + struct hsr_priv *hsr = port_rcv->hsr; struct hsr_sup_payload *hsr_sp; + struct hsr_node *node_real; struct list_head *node_db; + struct ethhdr *ethhdr; int i; ethhdr = (struct ethhdr *)skb_mac_header(skb); @@ -231,7 +246,7 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, node_real = find_node_by_addr_A(node_db, hsr_sp->macaddress_A); if (!node_real) /* No frame received from AddrA of this node yet */ - node_real = hsr_add_node(node_db, hsr_sp->macaddress_A, + node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A, HSR_SEQNR_START - 1); if (!node_real) goto done; /* No mem */ @@ -252,7 +267,9 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, } node_real->addr_B_port = port_rcv->type; + spin_lock_bh(&hsr->list_lock); list_del_rcu(&node_curr->mac_list); + spin_unlock_bh(&hsr->list_lock); kfree_rcu(node_curr, rcu_head); done: @@ -368,12 +385,13 @@ void hsr_prune_nodes(struct timer_list *t) { struct hsr_priv *hsr = from_timer(hsr, t, prune_timer); struct hsr_node *node; + struct hsr_node *tmp; struct hsr_port *port; unsigned long timestamp; unsigned long time_a, time_b; - rcu_read_lock(); - list_for_each_entry_rcu(node, &hsr->node_db, mac_list) { + spin_lock_bh(&hsr->list_lock); + list_for_each_entry_safe(node, tmp, &hsr->node_db, mac_list) { /* Don't prune own node. Neither time_in[HSR_PT_SLAVE_A] * nor time_in[HSR_PT_SLAVE_B], will ever be updated for * the master port. Thus the master node will be repeatedly @@ -421,7 +439,7 @@ void hsr_prune_nodes(struct timer_list *t) kfree_rcu(node, rcu_head); } } - rcu_read_unlock(); + spin_unlock_bh(&hsr->list_lock); /* Restart timer */ mod_timer(&hsr->prune_timer, diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index 89a3ce38151d..0f0fa12b4329 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -12,10 +12,8 @@ struct hsr_node; -void hsr_del_self_node(struct list_head *self_node_db); +void hsr_del_self_node(struct hsr_priv *hsr); void hsr_del_nodes(struct list_head *node_db); -struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[], - u16 seq_out); struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, bool is_sup); void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, @@ -33,7 +31,7 @@ int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, void hsr_prune_nodes(struct timer_list *t); -int hsr_create_self_node(struct list_head *self_node_db, +int hsr_create_self_node(struct hsr_priv *hsr, unsigned char addr_a[ETH_ALEN], unsigned char addr_b[ETH_ALEN]); diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index b9988a662ee1..9e389accbfc7 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -45,6 +45,10 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, case NETDEV_CHANGE: /* Link (carrier) state changes */ hsr_check_carrier_and_operstate(hsr); break; + case NETDEV_CHANGENAME: + if (is_hsr_master(dev)) + hsr_debugfs_rename(dev); + break; case NETDEV_CHANGEADDR: if (port->type == HSR_PT_MASTER) { /* This should not happen since there's no @@ -64,7 +68,7 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, /* Make sure we recognize frames from ourselves in hsr_rcv() */ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); - res = hsr_create_self_node(&hsr->self_node_db, + res = hsr_create_self_node(hsr, master->dev->dev_addr, port ? port->dev->dev_addr : @@ -123,6 +127,7 @@ static void __exit hsr_exit(void) { unregister_netdevice_notifier(&hsr_nb); hsr_netlink_exit(); + hsr_debugfs_remove_root(); } module_init(hsr_init); diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 96fac696a1e1..754d84b217f0 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -160,8 +160,9 @@ struct hsr_priv { int announce_count; u16 sequence_nr; u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */ - u8 prot_version; /* Indicate if HSRv0 or HSRv1. */ - spinlock_t seqnr_lock; /* locking for sequence_nr */ + u8 prot_version; /* Indicate if HSRv0 or HSRv1. */ + spinlock_t seqnr_lock; /* locking for sequence_nr */ + spinlock_t list_lock; /* locking for node list */ unsigned char sup_multicast_addr[ETH_ALEN]; #ifdef CONFIG_DEBUG_FS struct dentry *node_tbl_root; @@ -184,17 +185,24 @@ static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb) } #if IS_ENABLED(CONFIG_DEBUG_FS) -int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev); +void hsr_debugfs_rename(struct net_device *dev); +void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev); void hsr_debugfs_term(struct hsr_priv *priv); +void hsr_debugfs_create_root(void); +void hsr_debugfs_remove_root(void); #else -static inline int hsr_debugfs_init(struct hsr_priv *priv, - struct net_device *hsr_dev) +static inline void hsr_debugfs_rename(struct net_device *dev) { - return 0; } - +static inline void hsr_debugfs_init(struct hsr_priv *priv, + struct net_device *hsr_dev) +{} static inline void hsr_debugfs_term(struct hsr_priv *priv) {} +static inline void hsr_debugfs_create_root(void) +{} +static inline void hsr_debugfs_remove_root(void) +{} #endif #endif /* __HSR_PRIVATE_H */ diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 8f8337f893ba..8dc0547f01d0 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -476,6 +476,7 @@ int __init hsr_netlink_init(void) if (rc) goto fail_genl_register_family; + hsr_debugfs_create_root(); return 0; fail_genl_register_family: diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index ee561297d8a7..fbfd0db182b7 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -27,6 +27,8 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) rcu_read_lock(); /* hsr->node_db, hsr->ports */ port = hsr_port_get_rcu(skb->dev); + if (!port) + goto finish_pass; if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) { /* Directly kill frames sent by ourselves */ diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 3297e7fa9945..c0b107cdd715 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -58,13 +58,6 @@ static const struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; -static int lowpan_dev_init(struct net_device *ldev) -{ - netdev_lockdep_set_classes(ldev); - - return 0; -} - static int lowpan_open(struct net_device *dev) { if (!open_count) @@ -96,7 +89,6 @@ static int lowpan_get_iflink(const struct net_device *dev) } static const struct net_device_ops lowpan_netdev_ops = { - .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index 60b7ac56a1f5..de259b5170ab 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -23,11 +23,6 @@ LIST_HEAD(cfg802154_rdev_list); int cfg802154_rdev_list_generation; -static int wpan_phy_match(struct device *dev, const void *data) -{ - return !strcmp(dev_name(dev), (const char *)data); -} - struct wpan_phy *wpan_phy_find(const char *str) { struct device *dev; @@ -35,7 +30,7 @@ struct wpan_phy *wpan_phy_find(const char *str) if (WARN_ON(!str)) return NULL; - dev = class_find_device(&wpan_phy_class, NULL, str, wpan_phy_match); + dev = class_find_device_by_name(&wpan_phy_class, str); if (!dev) return NULL; diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index ffcfcef76291..7c5a1aa5adb4 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -236,21 +236,14 @@ nl802154_prepare_wpan_dev_dump(struct sk_buff *skb, struct cfg802154_registered_device **rdev, struct wpan_dev **wpan_dev) { + const struct genl_dumpit_info *info = genl_dumpit_info(cb); int err; rtnl_lock(); if (!cb->args[0]) { - err = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + nl802154_fam.hdrsize, - genl_family_attrbuf(&nl802154_fam), - nl802154_fam.maxattr, - nl802154_policy, NULL); - if (err) - goto out_unlock; - *wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk), - genl_family_attrbuf(&nl802154_fam)); + info->attrs); if (IS_ERR(*wpan_dev)) { err = PTR_ERR(*wpan_dev); goto out_unlock; @@ -557,17 +550,8 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb, struct netlink_callback *cb, struct nl802154_dump_wpan_phy_state *state) { - struct nlattr **tb = genl_family_attrbuf(&nl802154_fam); - int ret = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + nl802154_fam.hdrsize, - tb, nl802154_fam.maxattr, - nl802154_policy, NULL); - - /* TODO check if we can handle error here, - * we have no backward compatibility - */ - if (ret) - return 0; + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct nlattr **tb = info->attrs; if (tb[NL802154_ATTR_WPAN_PHY]) state->filter_wpan_phy = nla_get_u32(tb[NL802154_ATTR_WPAN_PHY]); @@ -2203,7 +2187,8 @@ static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, static const struct genl_ops nl802154_ops[] = { { .cmd = NL802154_CMD_GET_WPAN_PHY, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .doit = nl802154_get_wpan_phy, .dumpit = nl802154_dump_wpan_phy, .done = nl802154_dump_wpan_phy_done, @@ -2343,7 +2328,8 @@ static const struct genl_ops nl802154_ops[] = { }, { .cmd = NL802154_CMD_GET_SEC_KEY, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching key id? */ .dumpit = nl802154_dump_llsec_key, .flags = GENL_ADMIN_PERM, @@ -2369,7 +2355,8 @@ static const struct genl_ops nl802154_ops[] = { /* TODO unique identifier must short+pan OR extended_addr */ { .cmd = NL802154_CMD_GET_SEC_DEV, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching extended_addr? */ .dumpit = nl802154_dump_llsec_dev, .flags = GENL_ADMIN_PERM, @@ -2395,7 +2382,8 @@ static const struct genl_ops nl802154_ops[] = { /* TODO remove complete devkey, put it as nested? */ { .cmd = NL802154_CMD_GET_SEC_DEVKEY, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO doit by matching ??? */ .dumpit = nl802154_dump_llsec_devkey, .flags = GENL_ADMIN_PERM, @@ -2420,7 +2408,8 @@ static const struct genl_ops nl802154_ops[] = { }, { .cmd = NL802154_CMD_GET_SEC_LEVEL, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching frame_type? */ .dumpit = nl802154_dump_llsec_seclevel, .flags = GENL_ADMIN_PERM, diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index dacbd58e1799..d93d4531aa9b 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -1008,6 +1008,9 @@ static int ieee802154_create(struct net *net, struct socket *sock, switch (sock->type) { case SOCK_RAW: + rc = -EPERM; + if (!capable(CAP_NET_RAW)) + goto out; proto = &ieee802154_raw_prot; ops = &ieee802154_raw_ops; break; @@ -1092,7 +1095,7 @@ static struct packet_type ieee802154_packet_type = { static int __init af_ieee802154_init(void) { - int rc = -EINVAL; + int rc; rc = proto_register(&ieee802154_raw_prot, 1); if (rc) diff --git a/net/ife/Kconfig b/net/ife/Kconfig index 6cd1f6d18f30..bcf650564db4 100644 --- a/net/ife/Kconfig +++ b/net/ife/Kconfig @@ -5,7 +5,7 @@ menuconfig NET_IFE depends on NET - tristate "Inter-FE based on IETF ForCES InterFE LFB" + tristate "Inter-FE based on IETF ForCES InterFE LFB" default n help Say Y here to add support of IFE encapsulation protocol diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 974de4d20f25..f96bd489b362 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -180,8 +180,8 @@ config NET_IPIP config NET_IPGRE_DEMUX tristate "IP: GRE demultiplexer" help - This is helper module to demultiplex GRE packets on GRE version field criteria. - Required by ip_gre and pptp modules. + This is helper module to demultiplex GRE packets on GRE version field criteria. + Required by ip_gre and pptp modules. config NET_IP_TUNNEL tristate @@ -378,6 +378,17 @@ config INET_ESP_OFFLOAD If unsure, say N. +config INET_ESPINTCP + bool "IP: ESP in TCP encapsulation (RFC 8229)" + depends on XFRM && INET_ESP + select STREAM_PARSER + select NET_SOCK_MSG + help + Support for RFC 8229 encapsulation of ESP and IKE over + TCP/IPv4 sockets. + + If unsure, say N. + config INET_IPCOMP tristate "IP: IPComp transformation" select INET_XFRM_TUNNEL @@ -459,200 +470,200 @@ config TCP_CONG_BIC tristate "Binary Increase Congestion (BIC) control" default m ---help--- - BIC-TCP is a sender-side only change that ensures a linear RTT - fairness under large windows while offering both scalability and - bounded TCP-friendliness. The protocol combines two schemes - called additive increase and binary search increase. When the - congestion window is large, additive increase with a large - increment ensures linear RTT fairness as well as good - scalability. Under small congestion windows, binary search - increase provides TCP friendliness. - See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ + BIC-TCP is a sender-side only change that ensures a linear RTT + fairness under large windows while offering both scalability and + bounded TCP-friendliness. The protocol combines two schemes + called additive increase and binary search increase. When the + congestion window is large, additive increase with a large + increment ensures linear RTT fairness as well as good + scalability. Under small congestion windows, binary search + increase provides TCP friendliness. + See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ config TCP_CONG_CUBIC tristate "CUBIC TCP" default y ---help--- - This is version 2.0 of BIC-TCP which uses a cubic growth function - among other techniques. - See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf + This is version 2.0 of BIC-TCP which uses a cubic growth function + among other techniques. + See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf config TCP_CONG_WESTWOOD tristate "TCP Westwood+" default m ---help--- - TCP Westwood+ is a sender-side only modification of the TCP Reno - protocol stack that optimizes the performance of TCP congestion - control. It is based on end-to-end bandwidth estimation to set - congestion window and slow start threshold after a congestion - episode. Using this estimation, TCP Westwood+ adaptively sets a - slow start threshold and a congestion window which takes into - account the bandwidth used at the time congestion is experienced. - TCP Westwood+ significantly increases fairness wrt TCP Reno in - wired networks and throughput over wireless links. + TCP Westwood+ is a sender-side only modification of the TCP Reno + protocol stack that optimizes the performance of TCP congestion + control. It is based on end-to-end bandwidth estimation to set + congestion window and slow start threshold after a congestion + episode. Using this estimation, TCP Westwood+ adaptively sets a + slow start threshold and a congestion window which takes into + account the bandwidth used at the time congestion is experienced. + TCP Westwood+ significantly increases fairness wrt TCP Reno in + wired networks and throughput over wireless links. config TCP_CONG_HTCP - tristate "H-TCP" - default m + tristate "H-TCP" + default m ---help--- - H-TCP is a send-side only modifications of the TCP Reno - protocol stack that optimizes the performance of TCP - congestion control for high speed network links. It uses a - modeswitch to change the alpha and beta parameters of TCP Reno - based on network conditions and in a way so as to be fair with - other Reno and H-TCP flows. + H-TCP is a send-side only modifications of the TCP Reno + protocol stack that optimizes the performance of TCP + congestion control for high speed network links. It uses a + modeswitch to change the alpha and beta parameters of TCP Reno + based on network conditions and in a way so as to be fair with + other Reno and H-TCP flows. config TCP_CONG_HSTCP tristate "High Speed TCP" default n ---help--- - Sally Floyd's High Speed TCP (RFC 3649) congestion control. - A modification to TCP's congestion control mechanism for use - with large congestion windows. A table indicates how much to - increase the congestion window by when an ACK is received. - For more detail see http://www.icir.org/floyd/hstcp.html + Sally Floyd's High Speed TCP (RFC 3649) congestion control. + A modification to TCP's congestion control mechanism for use + with large congestion windows. A table indicates how much to + increase the congestion window by when an ACK is received. + For more detail see http://www.icir.org/floyd/hstcp.html config TCP_CONG_HYBLA tristate "TCP-Hybla congestion control algorithm" default n ---help--- - TCP-Hybla is a sender-side only change that eliminates penalization of - long-RTT, large-bandwidth connections, like when satellite legs are - involved, especially when sharing a common bottleneck with normal - terrestrial connections. + TCP-Hybla is a sender-side only change that eliminates penalization of + long-RTT, large-bandwidth connections, like when satellite legs are + involved, especially when sharing a common bottleneck with normal + terrestrial connections. config TCP_CONG_VEGAS tristate "TCP Vegas" default n ---help--- - TCP Vegas is a sender-side only change to TCP that anticipates - the onset of congestion by estimating the bandwidth. TCP Vegas - adjusts the sending rate by modifying the congestion - window. TCP Vegas should provide less packet loss, but it is - not as aggressive as TCP Reno. + TCP Vegas is a sender-side only change to TCP that anticipates + the onset of congestion by estimating the bandwidth. TCP Vegas + adjusts the sending rate by modifying the congestion + window. TCP Vegas should provide less packet loss, but it is + not as aggressive as TCP Reno. config TCP_CONG_NV - tristate "TCP NV" - default n - ---help--- - TCP NV is a follow up to TCP Vegas. It has been modified to deal with - 10G networks, measurement noise introduced by LRO, GRO and interrupt - coalescence. In addition, it will decrease its cwnd multiplicatively - instead of linearly. + tristate "TCP NV" + default n + ---help--- + TCP NV is a follow up to TCP Vegas. It has been modified to deal with + 10G networks, measurement noise introduced by LRO, GRO and interrupt + coalescence. In addition, it will decrease its cwnd multiplicatively + instead of linearly. - Note that in general congestion avoidance (cwnd decreased when # packets - queued grows) cannot coexist with congestion control (cwnd decreased only - when there is packet loss) due to fairness issues. One scenario when they - can coexist safely is when the CA flows have RTTs << CC flows RTTs. + Note that in general congestion avoidance (cwnd decreased when # packets + queued grows) cannot coexist with congestion control (cwnd decreased only + when there is packet loss) due to fairness issues. One scenario when they + can coexist safely is when the CA flows have RTTs << CC flows RTTs. - For further details see http://www.brakmo.org/networking/tcp-nv/ + For further details see http://www.brakmo.org/networking/tcp-nv/ config TCP_CONG_SCALABLE tristate "Scalable TCP" default n ---help--- - Scalable TCP is a sender-side only change to TCP which uses a - MIMD congestion control algorithm which has some nice scaling - properties, though is known to have fairness issues. - See http://www.deneholme.net/tom/scalable/ + Scalable TCP is a sender-side only change to TCP which uses a + MIMD congestion control algorithm which has some nice scaling + properties, though is known to have fairness issues. + See http://www.deneholme.net/tom/scalable/ config TCP_CONG_LP tristate "TCP Low Priority" default n ---help--- - TCP Low Priority (TCP-LP), a distributed algorithm whose goal is - to utilize only the excess network bandwidth as compared to the - ``fair share`` of bandwidth as targeted by TCP. - See http://www-ece.rice.edu/networks/TCP-LP/ + TCP Low Priority (TCP-LP), a distributed algorithm whose goal is + to utilize only the excess network bandwidth as compared to the + ``fair share`` of bandwidth as targeted by TCP. + See http://www-ece.rice.edu/networks/TCP-LP/ config TCP_CONG_VENO tristate "TCP Veno" default n ---help--- - TCP Veno is a sender-side only enhancement of TCP to obtain better - throughput over wireless networks. TCP Veno makes use of state - distinguishing to circumvent the difficult judgment of the packet loss - type. TCP Veno cuts down less congestion window in response to random - loss packets. - See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> + TCP Veno is a sender-side only enhancement of TCP to obtain better + throughput over wireless networks. TCP Veno makes use of state + distinguishing to circumvent the difficult judgment of the packet loss + type. TCP Veno cuts down less congestion window in response to random + loss packets. + See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> config TCP_CONG_YEAH tristate "YeAH TCP" select TCP_CONG_VEGAS default n ---help--- - YeAH-TCP is a sender-side high-speed enabled TCP congestion control - algorithm, which uses a mixed loss/delay approach to compute the - congestion window. It's design goals target high efficiency, - internal, RTT and Reno fairness, resilience to link loss while - keeping network elements load as low as possible. + YeAH-TCP is a sender-side high-speed enabled TCP congestion control + algorithm, which uses a mixed loss/delay approach to compute the + congestion window. It's design goals target high efficiency, + internal, RTT and Reno fairness, resilience to link loss while + keeping network elements load as low as possible. - For further details look here: - http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf + For further details look here: + http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf config TCP_CONG_ILLINOIS tristate "TCP Illinois" default n ---help--- - TCP-Illinois is a sender-side modification of TCP Reno for - high speed long delay links. It uses round-trip-time to - adjust the alpha and beta parameters to achieve a higher average - throughput and maintain fairness. + TCP-Illinois is a sender-side modification of TCP Reno for + high speed long delay links. It uses round-trip-time to + adjust the alpha and beta parameters to achieve a higher average + throughput and maintain fairness. - For further details see: - http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html + For further details see: + http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html config TCP_CONG_DCTCP tristate "DataCenter TCP (DCTCP)" default n ---help--- - DCTCP leverages Explicit Congestion Notification (ECN) in the network to - provide multi-bit feedback to the end hosts. It is designed to provide: + DCTCP leverages Explicit Congestion Notification (ECN) in the network to + provide multi-bit feedback to the end hosts. It is designed to provide: - - High burst tolerance (incast due to partition/aggregate), - - Low latency (short flows, queries), - - High throughput (continuous data updates, large file transfers) with - commodity, shallow-buffered switches. + - High burst tolerance (incast due to partition/aggregate), + - Low latency (short flows, queries), + - High throughput (continuous data updates, large file transfers) with + commodity, shallow-buffered switches. - All switches in the data center network running DCTCP must support - ECN marking and be configured for marking when reaching defined switch - buffer thresholds. The default ECN marking threshold heuristic for - DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets - (~100KB) at 10Gbps, but might need further careful tweaking. + All switches in the data center network running DCTCP must support + ECN marking and be configured for marking when reaching defined switch + buffer thresholds. The default ECN marking threshold heuristic for + DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets + (~100KB) at 10Gbps, but might need further careful tweaking. - For further details see: - http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + For further details see: + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf config TCP_CONG_CDG tristate "CAIA Delay-Gradient (CDG)" default n ---help--- - CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies - the TCP sender in order to: + CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies + the TCP sender in order to: o Use the delay gradient as a congestion signal. o Back off with an average probability that is independent of the RTT. o Coexist with flows that use loss-based congestion control. o Tolerate packet loss unrelated to congestion. - For further details see: - D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using - delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg + For further details see: + D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using + delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg config TCP_CONG_BBR tristate "BBR TCP" default n ---help--- - BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to - maximize network utilization and minimize queues. It builds an explicit - model of the the bottleneck delivery rate and path round-trip - propagation delay. It tolerates packet loss and delay unrelated to - congestion. It can operate over LAN, WAN, cellular, wifi, or cable - modem links. It can coexist with flows that use loss-based congestion - control, and can operate with shallow buffers, deep buffers, - bufferbloat, policers, or AQM schemes that do not provide a delay - signal. It requires the fq ("Fair Queue") pacing packet scheduler. + BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to + maximize network utilization and minimize queues. It builds an explicit + model of the the bottleneck delivery rate and path round-trip + propagation delay. It tolerates packet loss and delay unrelated to + congestion. It can operate over LAN, WAN, cellular, wifi, or cable + modem links. It can coexist with flows that use loss-based congestion + control, and can operate with shallow buffers, deep buffers, + bufferbloat, policers, or AQM schemes that do not provide a delay + signal. It requires the fq ("Fair Queue") pacing packet scheduler. choice prompt "Default TCP congestion control" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index d57ecfaf89d4..9d97bace13c8 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -65,3 +65,7 @@ obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o + +ifeq ($(CONFIG_BPF_JIT),y) +obj-$(CONFIG_BPF_SYSCALL) += bpf_tcp_ca.o +endif diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ed2301ef872e..2fe295432c24 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -208,7 +208,7 @@ int inet_listen(struct socket *sock, int backlog) if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) goto out; - sk->sk_max_ack_backlog = backlog; + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ @@ -495,7 +495,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, snum = ntohs(addr->sin_port); err = -EACCES; - if (snum && snum < inet_prot_sock(net) && + if (snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) goto out; @@ -1845,13 +1845,8 @@ static __net_init int inet_init_net(struct net *net) return 0; } -static __net_exit void inet_exit_net(struct net *net) -{ -} - static __net_initdata struct pernet_operations af_inet_ops = { .init = inet_init_net, - .exit = inet_exit_net, }; static int __init init_inet_pernet_ops(void) diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c new file mode 100644 index 000000000000..574972bc7299 --- /dev/null +++ b/net/ipv4/bpf_tcp_ca.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include <linux/types.h> +#include <linux/bpf_verifier.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/filter.h> +#include <net/tcp.h> + +static u32 optional_ops[] = { + offsetof(struct tcp_congestion_ops, init), + offsetof(struct tcp_congestion_ops, release), + offsetof(struct tcp_congestion_ops, set_state), + offsetof(struct tcp_congestion_ops, cwnd_event), + offsetof(struct tcp_congestion_ops, in_ack_event), + offsetof(struct tcp_congestion_ops, pkts_acked), + offsetof(struct tcp_congestion_ops, min_tso_segs), + offsetof(struct tcp_congestion_ops, sndbuf_expand), + offsetof(struct tcp_congestion_ops, cong_control), +}; + +static u32 unsupported_ops[] = { + offsetof(struct tcp_congestion_ops, get_info), +}; + +static const struct btf_type *tcp_sock_type; +static u32 tcp_sock_id, sock_id; + +static int bpf_tcp_ca_init(struct btf *btf) +{ + s32 type_id; + + type_id = btf_find_by_name_kind(btf, "sock", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + sock_id = type_id; + + type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + tcp_sock_id = type_id; + tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); + + return 0; +} + +static bool is_optional(u32 member_offset) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(optional_ops); i++) { + if (member_offset == optional_ops[i]) + return true; + } + + return false; +} + +static bool is_unsupported(u32 member_offset) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(unsupported_ops); i++) { + if (member_offset == unsupported_ops[i]) + return true; + } + + return false; +} + +extern struct btf *btf_vmlinux; + +static bool bpf_tcp_ca_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + + if (!btf_ctx_access(off, size, type, prog, info)) + return false; + + if (info->reg_type == PTR_TO_BTF_ID && info->btf_id == sock_id) + /* promote it to tcp_sock */ + info->btf_id = tcp_sock_id; + + return true; +} + +static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id) +{ + size_t end; + + if (atype == BPF_READ) + return btf_struct_access(log, t, off, size, atype, next_btf_id); + + if (t != tcp_sock_type) { + bpf_log(log, "only read is supported\n"); + return -EACCES; + } + + switch (off) { + case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv): + end = offsetofend(struct inet_connection_sock, icsk_ca_priv); + break; + case offsetof(struct inet_connection_sock, icsk_ack.pending): + end = offsetofend(struct inet_connection_sock, + icsk_ack.pending); + break; + case offsetof(struct tcp_sock, snd_cwnd): + end = offsetofend(struct tcp_sock, snd_cwnd); + break; + case offsetof(struct tcp_sock, snd_cwnd_cnt): + end = offsetofend(struct tcp_sock, snd_cwnd_cnt); + break; + case offsetof(struct tcp_sock, snd_ssthresh): + end = offsetofend(struct tcp_sock, snd_ssthresh); + break; + case offsetof(struct tcp_sock, ecn_flags): + end = offsetofend(struct tcp_sock, ecn_flags); + break; + default: + bpf_log(log, "no write support to tcp_sock at off %d\n", off); + return -EACCES; + } + + if (off + size > end) { + bpf_log(log, + "write access at off %d with size %d beyond the member of tcp_sock ended at %zu\n", + off, size, end); + return -EACCES; + } + + return NOT_INIT; +} + +BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt) +{ + /* bpf_tcp_ca prog cannot have NULL tp */ + __tcp_send_ack((struct sock *)tp, rcv_nxt); + return 0; +} + +static const struct bpf_func_proto bpf_tcp_send_ack_proto = { + .func = bpf_tcp_send_ack, + .gpl_only = false, + /* In case we want to report error later */ + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_ANYTHING, + .btf_id = &tcp_sock_id, +}; + +static const struct bpf_func_proto * +bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_tcp_send_ack: + return &bpf_tcp_send_ack_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { + .get_func_proto = bpf_tcp_ca_get_func_proto, + .is_valid_access = bpf_tcp_ca_is_valid_access, + .btf_struct_access = bpf_tcp_ca_btf_struct_access, +}; + +static int bpf_tcp_ca_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct tcp_congestion_ops *utcp_ca; + struct tcp_congestion_ops *tcp_ca; + size_t tcp_ca_name_len; + int prog_fd; + u32 moff; + + utcp_ca = (const struct tcp_congestion_ops *)udata; + tcp_ca = (struct tcp_congestion_ops *)kdata; + + moff = btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct tcp_congestion_ops, flags): + if (utcp_ca->flags & ~TCP_CONG_MASK) + return -EINVAL; + tcp_ca->flags = utcp_ca->flags; + return 1; + case offsetof(struct tcp_congestion_ops, name): + tcp_ca_name_len = strnlen(utcp_ca->name, sizeof(utcp_ca->name)); + if (!tcp_ca_name_len || + tcp_ca_name_len == sizeof(utcp_ca->name)) + return -EINVAL; + if (tcp_ca_find(utcp_ca->name)) + return -EEXIST; + memcpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name)); + return 1; + } + + if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL)) + return 0; + + /* Ensure bpf_prog is provided for compulsory func ptr */ + prog_fd = (int)(*(unsigned long *)(udata + moff)); + if (!prog_fd && !is_optional(moff) && !is_unsupported(moff)) + return -EINVAL; + + return 0; +} + +static int bpf_tcp_ca_check_member(const struct btf_type *t, + const struct btf_member *member) +{ + if (is_unsupported(btf_member_bit_offset(t, member) / 8)) + return -ENOTSUPP; + return 0; +} + +static int bpf_tcp_ca_reg(void *kdata) +{ + return tcp_register_congestion_control(kdata); +} + +static void bpf_tcp_ca_unreg(void *kdata) +{ + tcp_unregister_congestion_control(kdata); +} + +/* Avoid sparse warning. It is only used in bpf_struct_ops.c. */ +extern struct bpf_struct_ops bpf_tcp_congestion_ops; + +struct bpf_struct_ops bpf_tcp_congestion_ops = { + .verifier_ops = &bpf_tcp_ca_verifier_ops, + .reg = bpf_tcp_ca_reg, + .unreg = bpf_tcp_ca_unreg, + .check_member = bpf_tcp_ca_check_member, + .init_member = bpf_tcp_ca_init_member, + .init = bpf_tcp_ca_init, + .name = "tcp_congestion_ops", +}; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 7bd29e694603..4a8550c49202 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -15,6 +15,7 @@ #include <net/sock.h> #include <net/route.h> #include <net/tcp_states.h> +#include <net/sock_reuseport.h> int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -69,9 +70,10 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len } inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; + reuseport_has_conns(sk, true); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); - inet->inet_id = jiffies; + inet->inet_id = prandom_u32(); sk_dst_set(sk, &rt->dst); err = 0; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a4b5bd4d2c89..e4632bd2026d 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1496,11 +1496,6 @@ skip: } } -static bool inetdev_valid_mtu(unsigned int mtu) -{ - return mtu >= IPV4_MIN_MTU; -} - static void inetdev_send_gratuitous_arp(struct net_device *dev, struct in_device *in_dev) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 5c967764041f..103c7d599a3c 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -18,6 +18,8 @@ #include <net/icmp.h> #include <net/protocol.h> #include <net/udp.h> +#include <net/tcp.h> +#include <net/espintcp.h> #include <linux/highmem.h> @@ -117,6 +119,132 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) put_page(sg_page(sg)); } +#ifdef CONFIG_INET_ESPINTCP +struct esp_tcp_sk { + struct sock *sk; + struct rcu_head rcu; +}; + +static void esp_free_tcp_sk(struct rcu_head *head) +{ + struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); + + sock_put(esk->sk); + kfree(esk); +} + +static struct sock *esp_find_tcp_sk(struct xfrm_state *x) +{ + struct xfrm_encap_tmpl *encap = x->encap; + struct esp_tcp_sk *esk; + __be16 sport, dport; + struct sock *nsk; + struct sock *sk; + + sk = rcu_dereference(x->encap_sk); + if (sk && sk->sk_state == TCP_ESTABLISHED) + return sk; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (sk && sk == nsk) { + esk = kmalloc(sizeof(*esk), GFP_ATOMIC); + if (!esk) { + spin_unlock_bh(&x->lock); + return ERR_PTR(-ENOMEM); + } + RCU_INIT_POINTER(x->encap_sk, NULL); + esk->sk = sk; + call_rcu(&esk->rcu, esp_free_tcp_sk); + } + spin_unlock_bh(&x->lock); + + sk = inet_lookup_established(xs_net(x), &tcp_hashinfo, x->id.daddr.a4, + dport, x->props.saddr.a4, sport, 0); + if (!sk) + return ERR_PTR(-ENOENT); + + if (!tcp_is_ulp_esp(sk)) { + sock_put(sk); + return ERR_PTR(-EINVAL); + } + + spin_lock_bh(&x->lock); + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (encap->encap_sport != sport || + encap->encap_dport != dport) { + sock_put(sk); + sk = nsk ?: ERR_PTR(-EREMCHG); + } else if (sk == nsk) { + sock_put(sk); + } else { + rcu_assign_pointer(x->encap_sk, sk); + } + spin_unlock_bh(&x->lock); + + return sk; +} + +static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) +{ + struct sock *sk; + int err; + + rcu_read_lock(); + + sk = esp_find_tcp_sk(x); + err = PTR_ERR_OR_ZERO(sk); + if (err) + goto out; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + err = espintcp_queue_out(sk, skb); + else + err = espintcp_push_skb(sk, skb); + bh_unlock_sock(sk); + +out: + rcu_read_unlock(); + return err; +} + +static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + + return esp_output_tcp_finish(x, skb); +} + +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + + local_bh_disable(); + err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb); + local_bh_enable(); + + /* EINPROGRESS just happens to do the right thing. It + * actually means that the skb has been consumed and + * isn't coming back. + */ + return err ?: -EINPROGRESS; +} +#else +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) +{ + kfree_skb(skb); + + return -EOPNOTSUPP; +} +#endif + static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -147,7 +275,11 @@ static void esp_output_done(struct crypto_async_request *base, int err) secpath_reset(skb); xfrm_dev_resume(skb); } else { - xfrm_output_resume(skb, err); + if (!err && + x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) + esp_output_tail_tcp(x, skb); + else + xfrm_output_resume(skb, err); } } @@ -225,45 +357,100 @@ static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) tail[plen - 1] = proto; } -static int esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) +static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb, + int encap_type, + struct esp_info *esp, + __be16 sport, + __be16 dport) { - int encap_type; struct udphdr *uh; __be32 *udpdata32; - __be16 sport, dport; - struct xfrm_encap_tmpl *encap = x->encap; - struct ip_esp_hdr *esph = esp->esph; unsigned int len; - spin_lock_bh(&x->lock); - sport = encap->encap_sport; - dport = encap->encap_dport; - encap_type = encap->encap_type; - spin_unlock_bh(&x->lock); - len = skb->len + esp->tailen - skb_transport_offset(skb); - if (len + sizeof(struct iphdr) >= IP_MAX_MTU) - return -EMSGSIZE; + if (len + sizeof(struct iphdr) > IP_MAX_MTU) + return ERR_PTR(-EMSGSIZE); - uh = (struct udphdr *)esph; + uh = (struct udphdr *)esp->esph; uh->source = sport; uh->dest = dport; uh->len = htons(len); uh->check = 0; + *skb_mac_header(skb) = IPPROTO_UDP; + + if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) { + udpdata32 = (__be32 *)(uh + 1); + udpdata32[0] = udpdata32[1] = 0; + return (struct ip_esp_hdr *)(udpdata32 + 2); + } + + return (struct ip_esp_hdr *)(uh + 1); +} + +#ifdef CONFIG_INET_ESPINTCP +static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x, + struct sk_buff *skb, + struct esp_info *esp) +{ + __be16 *lenp = (void *)esp->esph; + struct ip_esp_hdr *esph; + unsigned int len; + struct sock *sk; + + len = skb->len + esp->tailen - skb_transport_offset(skb); + if (len > IP_MAX_MTU) + return ERR_PTR(-EMSGSIZE); + + rcu_read_lock(); + sk = esp_find_tcp_sk(x); + rcu_read_unlock(); + + if (IS_ERR(sk)) + return ERR_CAST(sk); + + *lenp = htons(len); + esph = (struct ip_esp_hdr *)(lenp + 1); + + return esph; +} +#else +static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x, + struct sk_buff *skb, + struct esp_info *esp) +{ + return ERR_PTR(-EOPNOTSUPP); +} +#endif + +static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb, + struct esp_info *esp) +{ + struct xfrm_encap_tmpl *encap = x->encap; + struct ip_esp_hdr *esph; + __be16 sport, dport; + int encap_type; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + encap_type = encap->encap_type; + spin_unlock_bh(&x->lock); + switch (encap_type) { default: case UDP_ENCAP_ESPINUDP: - esph = (struct ip_esp_hdr *)(uh + 1); - break; case UDP_ENCAP_ESPINUDP_NON_IKE: - udpdata32 = (__be32 *)(uh + 1); - udpdata32[0] = udpdata32[1] = 0; - esph = (struct ip_esp_hdr *)(udpdata32 + 2); + esph = esp_output_udp_encap(skb, encap_type, esp, sport, dport); + break; + case TCP_ENCAP_ESPINTCP: + esph = esp_output_tcp_encap(x, skb, esp); break; } - *skb_mac_header(skb) = IPPROTO_UDP; + if (IS_ERR(esph)) + return PTR_ERR(esph); + esp->esph = esph; return 0; @@ -279,9 +466,9 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct sk_buff *trailer; int tailen = esp->tailen; - /* this is non-NULL only with UDP Encapsulation */ + /* this is non-NULL only with TCP/UDP Encapsulation */ if (x->encap) { - int err = esp_output_udp_encap(x, skb, esp); + int err = esp_output_encap(x, skb, esp); if (err < 0) return err; @@ -474,6 +661,9 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * if (sg != dsg) esp_ssg_unref(x, tmp); + if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) + err = esp_output_tail_tcp(x, skb); + error_free: kfree(tmp); error: @@ -600,7 +790,23 @@ int esp_input_done2(struct sk_buff *skb, int err) if (x->encap) { struct xfrm_encap_tmpl *encap = x->encap; + struct tcphdr *th = (void *)(skb_network_header(skb) + ihl); struct udphdr *uh = (void *)(skb_network_header(skb) + ihl); + __be16 source; + + switch (x->encap->encap_type) { + case TCP_ENCAP_ESPINTCP: + source = th->source; + break; + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + source = uh->source; + break; + default: + WARN_ON_ONCE(1); + err = -EINVAL; + goto out; + } /* * 1) if the NAT-T peer's IP or port changed then @@ -609,11 +815,11 @@ int esp_input_done2(struct sk_buff *skb, int err) * SRC ports. */ if (iph->saddr != x->props.saddr.a4 || - uh->source != encap->encap_sport) { + source != encap->encap_sport) { xfrm_address_t ipaddr; ipaddr.a4 = iph->saddr; - km_new_mapping(x, &ipaddr, uh->source); + km_new_mapping(x, &ipaddr, source); /* XXX: perhaps add an extra * policy check here, to see @@ -988,6 +1194,14 @@ static int esp_init_state(struct xfrm_state *x) case UDP_ENCAP_ESPINUDP_NON_IKE: x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); break; +#ifdef CONFIG_INET_ESPINTCP + case TCP_ENCAP_ESPINTCP: + /* only the length field, TCP encap is done by + * the socket + */ + x->props.header_len += 2; + break; +#endif } } diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 0e4a7cf6bc87..e2e219c7854a 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -57,6 +57,8 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, if (!x) goto out_reset; + skb->mark = xfrm_smark_get(skb->mark, x); + sp->xvec[sp->len++] = x; sp->olen++; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index e8bc939b56dd..577db1d50a24 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -70,11 +70,6 @@ fail: fib_free_table(main_table); return -ENOMEM; } - -static bool fib4_has_custom_rules(struct net *net) -{ - return false; -} #else struct fib_table *fib_new_table(struct net *net, u32 id) @@ -124,17 +119,13 @@ struct fib_table *fib_get_table(struct net *net, u32 id) h = id & (FIB_TABLE_HASHSZ - 1); head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry_rcu(tb, head, tb_hlist) { + hlist_for_each_entry_rcu(tb, head, tb_hlist, + lockdep_rtnl_is_held()) { if (tb->tb_id == id) return tb; } return NULL; } - -static bool fib4_has_custom_rules(struct net *net) -{ - return net->ipv4.fib_has_custom_rules; -} #endif /* CONFIG_IP_MULTIPLE_TABLES */ static void fib_replace_table(struct net *net, struct fib_table *old, @@ -1147,7 +1138,7 @@ void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric) if (!(dev->flags & IFF_UP) || ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) || ipv4_is_zeronet(prefix) || - prefix == ifa->ifa_local || ifa->ifa_prefixlen == 32) + (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32)) return; /* add the new */ diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index a68b5e21ec51..c092e9a55790 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -16,6 +16,9 @@ struct fib_alias { u8 fa_slen; u32 tb_id; s16 fa_default; + u8 offload:1, + trap:1, + unused:6; struct rcu_head rcu; }; @@ -35,9 +38,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg, int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack); bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi); -int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, - u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, - unsigned int); +int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, + struct fib_rt_info *fri, unsigned int flags); void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index b804ccbdb241..0c28bd469a68 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -9,12 +9,12 @@ #include <net/netns/ipv4.h> #include <net/ip_fib.h> -int call_fib4_notifier(struct notifier_block *nb, struct net *net, +int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET; - return call_fib_notifier(nb, net, event_type, info); + return call_fib_notifier(nb, event_type, info); } int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, @@ -34,17 +34,16 @@ static unsigned int fib4_seq_read(struct net *net) return net->ipv4.fib_seq + fib4_rules_seq_read(net); } -static int fib4_dump(struct net *net, struct notifier_block *nb) +static int fib4_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { int err; - err = fib4_rules_dump(net, nb); + err = fib4_rules_dump(net, nb, extack); if (err) return err; - fib_notify(net, nb); - - return 0; + return fib_notify(net, nb, extack); } static const struct fib_notifier_ops fib4_notifier_ops_template = { diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index b43a7ba5c6a4..f99e3bac5cab 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -65,9 +65,10 @@ bool fib4_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL_GPL(fib4_rule_default); -int fib4_rules_dump(struct net *net, struct notifier_block *nb) +int fib4_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, AF_INET); + return fib_rules_dump(net, nb, AF_INET, extack); } unsigned int fib4_rules_seq_read(struct net *net) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2db089e10ba0..a803cdd9400a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -504,6 +504,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags) { + struct fib_rt_info fri; struct sk_buff *skb; u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; int err = -ENOBUFS; @@ -512,9 +513,15 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, if (!skb) goto errout; - err = fib_dump_info(skb, info->portid, seq, event, tb_id, - fa->fa_type, key, dst_len, - fa->fa_tos, fa->fa_info, nlm_flags); + fri.fi = fa->fa_info; + fri.tb_id = tb_id; + fri.dst = key; + fri.dst_len = dst_len; + fri.tos = fa->fa_tos; + fri.type = fa->fa_type; + fri.offload = fa->offload; + fri.trap = fa->trap; + err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -1582,7 +1589,7 @@ failure: } int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, - unsigned char *flags, bool skip_oif) + u8 rt_family, unsigned char *flags, bool skip_oif) { if (nhc->nhc_flags & RTNH_F_DEAD) *flags |= RTNH_F_DEAD; @@ -1613,7 +1620,7 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, /* if gateway family does not match nexthop family * gateway is encoded as RTA_VIA */ - if (nhc->nhc_gw_family != nhc->nhc_family) { + if (rt_family != nhc->nhc_gw_family) { int alen = sizeof(struct in6_addr); struct nlattr *nla; struct rtvia *via; @@ -1654,7 +1661,7 @@ EXPORT_SYMBOL_GPL(fib_nexthop_info); #if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6) int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, - int nh_weight) + int nh_weight, u8 rt_family) { const struct net_device *dev = nhc->nhc_dev; struct rtnexthop *rtnh; @@ -1667,7 +1674,7 @@ int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, rtnh->rtnh_hops = nh_weight - 1; rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; - if (fib_nexthop_info(skb, nhc, &flags, true) < 0) + if (fib_nexthop_info(skb, nhc, rt_family, &flags, true) < 0) goto nla_put_failure; rtnh->rtnh_flags = flags; @@ -1693,13 +1700,14 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) goto nla_put_failure; if (unlikely(fi->nh)) { - if (nexthop_mpath_fill_node(skb, fi->nh) < 0) + if (nexthop_mpath_fill_node(skb, fi->nh, AF_INET) < 0) goto nla_put_failure; goto mp_end; } for_nexthops(fi) { - if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight) < 0) + if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight, + AF_INET) < 0) goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (nh->nh_tclassid && @@ -1724,10 +1732,11 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) #endif int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, - u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, - struct fib_info *fi, unsigned int flags) + struct fib_rt_info *fri, unsigned int flags) { - unsigned int nhs = fib_info_num_path(fi); + unsigned int nhs = fib_info_num_path(fri->fi); + struct fib_info *fi = fri->fi; + u32 tb_id = fri->tb_id; struct nlmsghdr *nlh; struct rtmsg *rtm; @@ -1737,22 +1746,22 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, rtm = nlmsg_data(nlh); rtm->rtm_family = AF_INET; - rtm->rtm_dst_len = dst_len; + rtm->rtm_dst_len = fri->dst_len; rtm->rtm_src_len = 0; - rtm->rtm_tos = tos; + rtm->rtm_tos = fri->tos; if (tb_id < 256) rtm->rtm_table = tb_id; else rtm->rtm_table = RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, tb_id)) goto nla_put_failure; - rtm->rtm_type = type; + rtm->rtm_type = fri->type; rtm->rtm_flags = fi->fib_flags; rtm->rtm_scope = fi->fib_scope; rtm->rtm_protocol = fi->fib_protocol; if (rtm->rtm_dst_len && - nla_put_in_addr(skb, RTA_DST, dst)) + nla_put_in_addr(skb, RTA_DST, fri->dst)) goto nla_put_failure; if (fi->fib_priority && nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) @@ -1775,7 +1784,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); unsigned char flags = 0; - if (fib_nexthop_info(skb, nhc, &flags, false) < 0) + if (fib_nexthop_info(skb, nhc, AF_INET, &flags, false) < 0) goto nla_put_failure; rtm->rtm_flags = flags; @@ -1794,6 +1803,11 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; } + if (fri->offload) + rtm->rtm_flags |= RTM_F_OFFLOAD; + if (fri->trap) + rtm->rtm_flags |= RTM_F_TRAP; + nlmsg_end(skb, nlh); return 0; @@ -1813,8 +1827,8 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local) int ret = 0; unsigned int hash = fib_laddr_hashfn(local); struct hlist_head *head = &fib_info_laddrhash[hash]; + int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; struct net *net = dev_net(dev); - int tb_id = l3mdev_fib_table(dev); struct fib_info *fi; if (!fib_info_laddrhash || local == 0) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 2b2b3d291ab0..ff0c24371e33 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -74,11 +74,13 @@ #include <trace/events/fib.h> #include "fib_lookup.h" -static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, +static int call_fib_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_alias *fa) + int dst_len, struct fib_alias *fa, + struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { + .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, @@ -86,7 +88,7 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, .type = fa->fa_type, .tb_id = fa->tb_id, }; - return call_fib4_notifier(nb, net, event_type, &info.info); + return call_fib4_notifier(nb, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, @@ -978,9 +980,12 @@ static struct key_vector *fib_find_node(struct trie *t, /* Return the first fib alias matching TOS with * priority less than or equal to PRIO. + * If 'find_first' is set, return the first matching + * fib alias, regardless of TOS and priority. */ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, - u8 tos, u32 prio, u32 tb_id) + u8 tos, u32 prio, u32 tb_id, + bool find_first) { struct fib_alias *fa; @@ -996,6 +1001,8 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, continue; if (fa->tb_id != tb_id) break; + if (find_first) + return fa; if (fa->fa_tos > tos) continue; if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos) @@ -1005,6 +1012,52 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, return NULL; } +static struct fib_alias * +fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri) +{ + u8 slen = KEYLENGTH - fri->dst_len; + struct key_vector *l, *tp; + struct fib_table *tb; + struct fib_alias *fa; + struct trie *t; + + tb = fib_get_table(net, fri->tb_id); + if (!tb) + return NULL; + + t = (struct trie *)tb->tb_data; + l = fib_find_node(t, &tp, be32_to_cpu(fri->dst)); + if (!l) + return NULL; + + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + if (fa->fa_slen == slen && fa->tb_id == fri->tb_id && + fa->fa_tos == fri->tos && fa->fa_info == fri->fi && + fa->fa_type == fri->type) + return fa; + } + + return NULL; +} + +void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) +{ + struct fib_alias *fa_match; + + rcu_read_lock(); + + fa_match = fib_find_matching_alias(net, fri); + if (!fa_match) + goto out; + + fa_match->offload = fri->offload; + fa_match->trap = fri->trap; + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set); + static void trie_rebalance(struct trie *t, struct key_vector *tn) { while (!IS_TRIE(tn)) @@ -1061,9 +1114,6 @@ noleaf: return -ENOMEM; } -/* fib notifier for ADD is sent before calling fib_insert_alias with - * the expectation that the only possible failure ENOMEM - */ static int fib_insert_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *new, struct fib_alias *fa, t_key key) @@ -1116,11 +1166,13 @@ static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) return true; } +static void fib_remove_alias(struct trie *t, struct key_vector *tp, + struct key_vector *l, struct fib_alias *old); + /* Caller must hold RTNL. */ int fib_table_insert(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) { - enum fib_event_type event = FIB_EVENT_ENTRY_ADD; struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; struct key_vector *l, *tp; @@ -1147,7 +1199,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, l = fib_find_node(t, &tp, key); fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority, - tb->tb_id) : NULL; + tb->tb_id, false) : NULL; /* Now fa, if non-NULL, points to the first fib alias * with the same keys [prefix,tos,priority], if such key already @@ -1214,19 +1266,29 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_slen = fa->fa_slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; + new_fa->offload = 0; + new_fa->trap = 0; - err = call_fib_entry_notifiers(net, - FIB_EVENT_ENTRY_REPLACE, - key, plen, new_fa, - extack); - if (err) - goto out_free_new_fa; + hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); + + if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0, + tb->tb_id, true) == new_fa) { + enum fib_event_type fib_event; + + fib_event = FIB_EVENT_ENTRY_REPLACE; + err = call_fib_entry_notifiers(net, fib_event, + key, plen, + new_fa, extack); + if (err) { + hlist_replace_rcu(&new_fa->fa_list, + &fa->fa_list); + goto out_free_new_fa; + } + } rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); - hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); - alias_free_mem_rcu(fa); fib_release_info(fi_drop); @@ -1242,12 +1304,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb, if (fa_match) goto out; - if (cfg->fc_nlflags & NLM_F_APPEND) { - event = FIB_EVENT_ENTRY_APPEND; + if (cfg->fc_nlflags & NLM_F_APPEND) nlflags |= NLM_F_APPEND; - } else { + else fa = fa_first; - } } err = -ENOENT; if (!(cfg->fc_nlflags & NLM_F_CREATE)) @@ -1266,15 +1326,29 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_slen = slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; - - err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); - if (err) - goto out_free_new_fa; + new_fa->offload = 0; + new_fa->trap = 0; /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); if (err) - goto out_fib_notif; + goto out_free_new_fa; + + /* The alias was already inserted, so the node must exist. */ + l = l ? l : fib_find_node(t, &tp, key); + if (WARN_ON_ONCE(!l)) + goto out_free_new_fa; + + if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) == + new_fa) { + enum fib_event_type fib_event; + + fib_event = FIB_EVENT_ENTRY_REPLACE; + err = call_fib_entry_notifiers(net, fib_event, key, plen, + new_fa, extack); + if (err) + goto out_remove_new_fa; + } if (!plen) tb->tb_num_default++; @@ -1285,14 +1359,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb, succeeded: return 0; -out_fib_notif: - /* notifier was sent that entry would be added to trie, but - * the add failed and need to recover. Only failure for - * fib_insert_alias is ENOMEM. - */ - NL_SET_ERR_MSG(extack, "Failed to insert route into trie"); - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, - plen, new_fa, NULL); +out_remove_new_fa: + fib_remove_alias(t, tp, l, new_fa); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: @@ -1543,6 +1611,36 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp, node_pull_suffix(tp, fa->fa_slen); } +static void fib_notify_alias_delete(struct net *net, u32 key, + struct hlist_head *fah, + struct fib_alias *fa_to_delete, + struct netlink_ext_ack *extack) +{ + struct fib_alias *fa_next, *fa_to_notify; + u32 tb_id = fa_to_delete->tb_id; + u8 slen = fa_to_delete->fa_slen; + enum fib_event_type fib_event; + + /* Do not notify if we do not care about the route. */ + if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete) + return; + + /* Determine if the route should be replaced by the next route in the + * list. + */ + fa_next = hlist_entry_safe(fa_to_delete->fa_list.next, + struct fib_alias, fa_list); + if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) { + fib_event = FIB_EVENT_ENTRY_REPLACE; + fa_to_notify = fa_next; + } else { + fib_event = FIB_EVENT_ENTRY_DEL; + fa_to_notify = fa_to_delete; + } + call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen, + fa_to_notify, extack); +} + /* Caller must hold RTNL. */ int fib_table_delete(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) @@ -1564,7 +1662,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, if (!l) return -ESRCH; - fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id); + fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id, false); if (!fa) return -ESRCH; @@ -1596,8 +1694,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, if (!fa_to_delete) return -ESRCH; - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, - fa_to_delete, extack); + fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1921,10 +2018,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) continue; } - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, - n->key, - KEYLENGTH - fa->fa_slen, fa, - NULL); + fib_notify_alias_delete(net, n->key, &n->leaf, fa, + NULL); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); @@ -2015,10 +2110,13 @@ void fib_info_notify_update(struct net *net, struct nl_info *info) } } -static void fib_leaf_notify(struct net *net, struct key_vector *l, - struct fib_table *tb, struct notifier_block *nb) +static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, + struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct fib_alias *fa; + int last_slen = -1; + int err; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; @@ -2032,39 +2130,57 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l, if (tb->tb_id != fa->tb_id) continue; - call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key, - KEYLENGTH - fa->fa_slen, fa); + if (fa->fa_slen == last_slen) + continue; + + last_slen = fa->fa_slen; + err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE, + l->key, KEYLENGTH - fa->fa_slen, + fa, extack); + if (err) + return err; } + return 0; } -static void fib_table_notify(struct net *net, struct fib_table *tb, - struct notifier_block *nb) +static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; t_key key = 0; + int err; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { - fib_leaf_notify(net, l, tb, nb); + err = fib_leaf_notify(l, tb, nb, extack); + if (err) + return err; key = l->key + 1; /* stop in case of wrap around */ if (key < l->key) break; } + return 0; } -void fib_notify(struct net *net, struct notifier_block *nb) +int fib_notify(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { unsigned int h; + int err; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; - hlist_for_each_entry_rcu(tb, head, tb_hlist) - fib_table_notify(net, tb, nb); + hlist_for_each_entry_rcu(tb, head, tb_hlist) { + err = fib_table_notify(tb, nb, extack); + if (err) + return err; + } } + return 0; } static void __trie_free_rcu(struct rcu_head *head) @@ -2128,14 +2244,20 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, if (filter->dump_routes) { if (!s_fa) { + struct fib_rt_info fri; + + fri.fi = fi; + fri.tb_id = tb->tb_id; + fri.dst = xkey; + fri.dst_len = KEYLENGTH - fa->fa_slen; + fri.tos = fa->fa_tos; + fri.type = fa->fa_type; + fri.offload = fa->offload; + fri.trap = fa->trap; err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - RTM_NEWROUTE, - tb->tb_id, fa->fa_type, - xkey, - KEYLENGTH - fa->fa_slen, - fa->fa_tos, fi, flags); + RTM_NEWROUTE, &fri, flags); if (err < 0) goto stop; } @@ -2145,7 +2267,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, if (filter->dump_exceptions) { err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi, - &i_fa, s_fa); + &i_fa, s_fa, flags); if (err < 0) goto stop; } @@ -2175,6 +2297,12 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, int count = cb->args[2]; t_key key = cb->args[3]; + /* First time here, count and key are both always 0. Count > 0 + * and key == 0 means the dump has wrapped around and we are done. + */ + if (count && !key) + return skb->len; + while ((l = leaf_walk_rcu(&tp, key)) != NULL) { int err; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 30fa771d382a..dcc79ff54b41 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -662,8 +662,8 @@ static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, [FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, }, [FOU_ATTR_PEER_V4] = { .type = NLA_U32, }, - [FOU_ATTR_LOCAL_V6] = { .type = sizeof(struct in6_addr), }, - [FOU_ATTR_PEER_V6] = { .type = sizeof(struct in6_addr), }, + [FOU_ATTR_LOCAL_V6] = { .len = sizeof(struct in6_addr), }, + [FOU_ATTR_PEER_V6] = { .len = sizeof(struct in6_addr), }, [FOU_ATTR_PEER_PORT] = { .type = NLA_U16, }, [FOU_ATTR_IFINDEX] = { .type = NLA_S32, }, }; diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 44bfeecac33e..5fd6e8ed02b5 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -127,7 +127,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, if (!pskb_may_pull(skb, nhs + hdr_len + sizeof(*ershdr))) return -EINVAL; - ershdr = (struct erspan_base_hdr *)options; + ershdr = (struct erspan_base_hdr *)(skb->data + nhs + hdr_len); tpi->key = cpu_to_be32(get_session_id(ershdr)); } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 4de7e962d3da..2e6d1b7a7bc9 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -174,7 +174,7 @@ static struct sk_buff *gre_gro_receive(struct list_head *head, if (skb_gro_checksum_simple_validate(skb)) goto out_unlock; - skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0, + skb_gro_checksum_try_convert(skb, IPPROTO_GRE, null_compute_pseudo); } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 1510e951f451..f369e7ce685b 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -249,10 +249,11 @@ bool icmp_global_allow(void) bool rc = false; /* Check if token bucket is empty and cannot be refilled - * without taking the spinlock. + * without taking the spinlock. The READ_ONCE() are paired + * with the following WRITE_ONCE() in this same function. */ - if (!icmp_global.credit) { - delta = min_t(u32, now - icmp_global.stamp, HZ); + if (!READ_ONCE(icmp_global.credit)) { + delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ); if (delta < HZ / 50) return false; } @@ -262,14 +263,14 @@ bool icmp_global_allow(void) if (delta >= HZ / 50) { incr = sysctl_icmp_msgs_per_sec * delta / HZ ; if (incr) - icmp_global.stamp = now; + WRITE_ONCE(icmp_global.stamp, now); } credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst); if (credit) { credit--; rc = true; } - icmp_global.credit = credit; + WRITE_ONCE(icmp_global.credit, credit); spin_unlock(&icmp_global.lock); return rc; } @@ -582,7 +583,13 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, if (!rt) goto out; - net = dev_net(rt->dst.dev); + + if (rt->dst.dev) + net = dev_net(rt->dst.dev); + else if (skb_in->dev) + net = dev_net(skb_in->dev); + else + goto out; /* * Find the original header. It is expected to be valid, of course. @@ -676,7 +683,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); if (dev) - saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); + saddr = inet_select_addr(dev, iph->saddr, + RT_SCOPE_LINK); else saddr = 0; rcu_read_unlock(); @@ -740,6 +748,39 @@ out:; } EXPORT_SYMBOL(__icmp_send); +#if IS_ENABLED(CONFIG_NF_NAT) +#include <net/netfilter/nf_conntrack.h> +void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) +{ + struct sk_buff *cloned_skb = NULL; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + __be32 orig_ip; + + ct = nf_ct_get(skb_in, &ctinfo); + if (!ct || !(ct->status & IPS_SRC_NAT)) { + icmp_send(skb_in, type, code, info); + return; + } + + if (skb_shared(skb_in)) + skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC); + + if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head || + (skb_network_header(skb_in) + sizeof(struct iphdr)) > + skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in, + skb_network_offset(skb_in) + sizeof(struct iphdr)))) + goto out; + + orig_ip = ip_hdr(skb_in)->saddr; + ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip; + icmp_send(skb_in, type, code, info); + ip_hdr(skb_in)->saddr = orig_ip; +out: + consume_skb(cloned_skb); +} +EXPORT_SYMBOL(icmp_ndo_send); +#endif static void icmp_socket_deliver(struct sk_buff *skb, u32 info) { @@ -902,7 +943,7 @@ static bool icmp_redirect(struct sk_buff *skb) return false; } - icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway); + icmp_socket_deliver(skb, ntohl(icmp_hdr(skb)->un.gateway)); return true; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 180f6896b98b..3b9c7a2725a9 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1475,7 +1475,7 @@ EXPORT_SYMBOL(__ip_mc_inc_group); void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) { - __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE); + __ip_mc_inc_group(in_dev, addr, GFP_KERNEL); } EXPORT_SYMBOL(ip_mc_inc_group); @@ -1563,7 +1563,7 @@ static int ip_mc_check_igmp_msg(struct sk_buff *skb) } } -static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb) +static __sum16 ip_mc_validate_checksum(struct sk_buff *skb) { return skb_checksum_simple_validate(skb); } @@ -2197,7 +2197,7 @@ static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr, iml->sflist = NULL; iml->sfmode = mode; rcu_assign_pointer(inet->mc_list, iml); - __ip_mc_inc_group(in_dev, addr, mode); + ____ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL); err = 0; done: return err; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f5c163d4771b..a4db79b1b643 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -560,7 +560,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_gw_family) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; rcu_read_unlock(); return &rt->dst; @@ -598,7 +598,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_gw_family) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; return &rt->dst; @@ -610,12 +610,6 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); -#if IS_ENABLED(CONFIG_IPV6) -#define AF_INET_FAMILY(fam) ((fam) == AF_INET) -#else -#define AF_INET_FAMILY(fam) true -#endif - /* Decide when to expire the request and when to resend SYN-ACK */ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, const int max_retries, @@ -716,7 +710,7 @@ static void reqsk_timer_handler(struct timer_list *t) * ones are about to clog our table. */ qlen = reqsk_queue_len(queue); - if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) { + if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; while (thresh > 2) { @@ -770,6 +764,18 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, } EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); +static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk, + const gfp_t priority) +{ + struct inet_connection_sock *icsk = inet_csk(newsk); + + if (!icsk->icsk_ulp_ops) + return; + + if (icsk->icsk_ulp_ops->clone) + icsk->icsk_ulp_ops->clone(req, newsk, priority); +} + /** * inet_csk_clone_lock - clone an inet socket, and lock its clone * @sk: the socket to clone @@ -810,6 +816,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, /* Deinitialize accept_queue to trap illegal accesses. */ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); + inet_clone_ulp(req, newsk, priority); + security_inet_csk_clone(newsk, req); } return newsk; @@ -906,7 +914,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, percpu_counter_inc(sk->sk_prot->orphan_count); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { - BUG_ON(tcp_sk(child)->fastopen_rsk != req); + BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req); BUG_ON(sk != req->rsk_listener); /* Paranoid, to prevent race condition if @@ -915,7 +923,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, * Also to satisfy an assertion in * tcp_v4_destroy_sock(). */ - tcp_sk(child)->fastopen_rsk = NULL; + RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL); } inet_csk_destroy_sock(child); } @@ -934,7 +942,7 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk, req->sk = child; req->dl_next = NULL; if (queue->rskq_accept_head == NULL) - queue->rskq_accept_head = req; + WRITE_ONCE(queue->rskq_accept_head, req); else queue->rskq_accept_tail->dl_next = req; queue->rskq_accept_tail = req; @@ -1086,7 +1094,7 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) if (!dst) goto out; } - dst->ops->update_pmtu(dst, sk, NULL, mtu); + dst->ops->update_pmtu(dst, sk, NULL, mtu, true); dst = __sk_dst_check(sk, 0); if (!dst) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index bbb005eb5218..f11e997e517b 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -193,7 +193,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { struct inet_diag_meminfo minfo = { .idiag_rmem = sk_rmem_alloc_get(sk), - .idiag_wmem = sk->sk_wmem_queued, + .idiag_wmem = READ_ONCE(sk->sk_wmem_queued), .idiag_fmem = sk->sk_forward_alloc, .idiag_tmem = sk_wmem_alloc_get(sk), }; @@ -226,17 +226,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; r->idiag_expires = - jiffies_to_msecs(icsk->icsk_timeout - jiffies); + jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = - jiffies_to_msecs(icsk->icsk_timeout - jiffies); + jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); } else if (timer_pending(&sk->sk_timer)) { r->idiag_timer = 2; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = - jiffies_to_msecs(sk->sk_timer.expires - jiffies); + jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies); } else { r->idiag_timer = 0; r->idiag_expires = 0; @@ -342,16 +342,13 @@ static int inet_twsk_diag_fill(struct sock *sk, r = nlmsg_data(nlh); BUG_ON(tw->tw_state != TCP_TIME_WAIT); - tmo = tw->tw_timer.expires - jiffies; - if (tmo < 0) - tmo = 0; - inet_diag_msg_common_fill(r, sk); r->idiag_retrans = 0; r->idiag_state = tw->tw_substate; r->idiag_timer = 3; - r->idiag_expires = jiffies_to_msecs(tmo); + tmo = tw->tw_timer.expires - jiffies; + r->idiag_expires = jiffies_delta_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; @@ -385,7 +382,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, offsetof(struct sock, sk_cookie)); tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies; - r->idiag_expires = (tmo >= 0) ? jiffies_to_msecs(tmo) : 0; + r->idiag_expires = jiffies_delta_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; @@ -914,11 +911,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, for (i = s_i; i < INET_LHTABLE_SIZE; i++) { struct inet_listen_hashbucket *ilb; + struct hlist_nulls_node *node; num = 0; ilb = &hashinfo->listening_hash[i]; spin_lock(&ilb->lock); - sk_for_each(sk, &ilb->head) { + sk_nulls_for_each(sk, node, &ilb->nulls_head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 97824864e40d..2bbaaf0c7176 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -240,7 +240,7 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score = sk->sk_family == PF_INET ? 2 : 1; - if (sk->sk_incoming_cpu == raw_smp_processor_id()) + if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } return score; @@ -516,10 +516,11 @@ static int inet_reuseport_add_sock(struct sock *sk, struct inet_listen_hashbucket *ilb) { struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; + const struct hlist_nulls_node *node; struct sock *sk2; kuid_t uid = sock_i_uid(sk); - sk_for_each_rcu(sk2, &ilb->head) { + sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { if (sk2 != sk && sk2->sk_family == sk->sk_family && ipv6_only_sock(sk2) == ipv6_only_sock(sk) && @@ -555,9 +556,9 @@ int __inet_hash(struct sock *sk, struct sock *osk) } if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) - hlist_add_tail_rcu(&sk->sk_node, &ilb->head); + __sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head); else - hlist_add_head_rcu(&sk->sk_node, &ilb->head); + __sk_nulls_add_node_rcu(sk, &ilb->nulls_head); inet_hash2(hashinfo, sk); ilb->count++; sock_set_flag(sk, SOCK_RCU_FREE); @@ -606,11 +607,9 @@ void inet_unhash(struct sock *sk) reuseport_detach_sock(sk); if (ilb) { inet_unhash2(hashinfo, sk); - __sk_del_node_init(sk); - ilb->count--; - } else { - __sk_nulls_del_node_init_rcu(sk); + ilb->count--; } + __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); unlock: spin_unlock_bh(lock); @@ -750,7 +749,8 @@ void inet_hashinfo_init(struct inet_hashinfo *h) for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); - INIT_HLIST_HEAD(&h->listening_hash[i].head); + INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head, + i + LISTENING_NULLS_BASE); h->listening_hash[i].count = 0; } diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index be778599bfed..ff327a62c9ce 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -160,7 +160,12 @@ static void inet_peer_gc(struct inet_peer_base *base, base->total / inet_peer_threshold * HZ; for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; - delta = (__u32)jiffies - p->dtime; + + /* The READ_ONCE() pairs with the WRITE_ONCE() + * in inet_putpeer() + */ + delta = (__u32)jiffies - READ_ONCE(p->dtime); + if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) gc_stack[i] = NULL; } @@ -237,7 +242,10 @@ EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { - p->dtime = (__u32)jiffies; + /* The WRITE_ONCE() pairs with itself (we run lockless) + * and the READ_ONCE() in inet_peer_gc() + */ + WRITE_ONCE(p->dtime, (__u32)jiffies); if (refcount_dec_and_test(&p->refcnt)) call_rcu(&p->rcu, inetpeer_free_rcu); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 06f6f280b9ff..00ec819f949b 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -123,7 +123,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && rt->rt_gw_family) + if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; IPCB(skb)->flags |= IPSKB_FORWARDED; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index a53a543fe055..8274f98c511c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -340,6 +340,8 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, iph->saddr, iph->daddr, tpi->key); if (tunnel) { + const struct iphdr *tnl_params; + if (__iptunnel_pull_header(skb, hdr_len, tpi->proto, raw_proto, false) < 0) goto drop; @@ -348,7 +350,9 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, skb_pop_mac_header(skb); else skb_reset_mac_header(skb); - if (tunnel->collect_md) { + + tnl_params = &tunnel->parms.iph; + if (tunnel->collect_md || tnl_params->daddr == 0) { __be16 flags; __be64 tun_id; @@ -509,9 +513,9 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) key = &tun_info->key; if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) goto err_free_skb; - md = ip_tunnel_info_opts(tun_info); - if (!md) + if (tun_info->options_len < sizeof(*md)) goto err_free_skb; + md = ip_tunnel_info_opts(tun_info); /* ERSPAN has fixed 8 byte GRE header */ version = md->version; @@ -1446,6 +1450,7 @@ static void erspan_setup(struct net_device *dev) struct ip_tunnel *t = netdev_priv(dev); ether_setup(dev); + dev->max_mtu = 0; dev->netdev_ops = &erspan_netdev_ops; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; @@ -1459,8 +1464,8 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, [IFLA_GRE_IKEY] = { .type = NLA_U32 }, [IFLA_GRE_OKEY] = { .type = NLA_U32 }, - [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, - [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, + [IFLA_GRE_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) }, + [IFLA_GRE_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_GRE_TTL] = { .type = NLA_U8 }, [IFLA_GRE_TOS] = { .type = NLA_U8 }, [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 1e2392b7c64e..aa438c6758a7 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -199,7 +199,7 @@ resubmit: kfree_skb(skb); return; } - nf_reset(skb); + nf_reset_ct(skb); } ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, skb); @@ -302,16 +302,31 @@ drop: return true; } +static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, + const struct sk_buff *hint) +{ + return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr && + ip_hdr(hint)->tos == iph->tos; +} + INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *)); static int ip_rcv_finish_core(struct net *net, struct sock *sk, - struct sk_buff *skb, struct net_device *dev) + struct sk_buff *skb, struct net_device *dev, + const struct sk_buff *hint) { const struct iphdr *iph = ip_hdr(skb); int (*edemux)(struct sk_buff *skb); struct rtable *rt; int err; + if (ip_can_use_hint(skb, iph, hint)) { + err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos, + dev, hint); + if (unlikely(err)) + goto drop_error; + } + if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk && @@ -408,7 +423,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) if (!skb) return NET_RX_SUCCESS; - ret = ip_rcv_finish_core(net, sk, skb, dev); + ret = ip_rcv_finish_core(net, sk, skb, dev, NULL); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; @@ -535,11 +550,20 @@ static void ip_sublist_rcv_finish(struct list_head *head) } } +static struct sk_buff *ip_extract_route_hint(const struct net *net, + struct sk_buff *skb, int rt_type) +{ + if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST) + return NULL; + + return skb; +} + static void ip_list_rcv_finish(struct net *net, struct sock *sk, struct list_head *head) { + struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; - struct sk_buff *skb, *next; struct list_head sublist; INIT_LIST_HEAD(&sublist); @@ -554,11 +578,14 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, skb = l3mdev_ip_rcv(skb); if (!skb) continue; - if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP) + if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP) continue; dst = skb_dst(skb); if (curr_dst != dst) { + hint = ip_extract_route_hint(net, skb, + ((struct rtable *)dst)->rt_type); + /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv_finish(&sublist); @@ -611,5 +638,6 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ - ip_sublist_rcv(&sublist, curr_dev, curr_net); + if (!list_empty(&sublist)) + ip_sublist_rcv(&sublist, curr_dev, curr_net); } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index cc7ef0d05bbd..d84819893db9 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -240,8 +240,8 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s static int ip_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu) { + struct sk_buff *segs, *nskb; netdev_features_t features; - struct sk_buff *segs; int ret = 0; /* common case: seglen is <= mtu @@ -272,8 +272,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, consume_skb(skb); - do { - struct sk_buff *nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { int err; skb_mark_not_on_list(segs); @@ -281,8 +280,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, if (err && ret == 0) ret = err; - segs = nskb; - } while (segs); + } return ret; } @@ -422,7 +420,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev; + struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); @@ -430,7 +428,7 @@ int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) skb->protocol = htons(ETH_P_IP); return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, NULL, dev, + net, sk, skb, indev, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } @@ -499,7 +497,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gw_family) + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ @@ -645,11 +643,12 @@ void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter) EXPORT_SYMBOL(ip_fraglist_prepare); void ip_frag_init(struct sk_buff *skb, unsigned int hlen, - unsigned int ll_rs, unsigned int mtu, + unsigned int ll_rs, unsigned int mtu, bool DF, struct ip_frag_state *state) { struct iphdr *iph = ip_hdr(skb); + state->DF = DF; state->hlen = hlen; state->ll_rs = ll_rs; state->mtu = mtu; @@ -668,9 +667,6 @@ static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to, /* Copy the flags to each fragment. */ IPCB(to)->flags = IPCB(from)->flags; - if (IPCB(from)->flags & IPSKB_FRAG_PMTU) - state->iph->frag_off |= htons(IP_DF); - /* ANK: dirty, but effective trick. Upgrade options only if * the segment to be fragmented was THE FIRST (otherwise, * options are already fixed) and make it ONCE @@ -738,6 +734,8 @@ struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state) */ iph = ip_hdr(skb2); iph->frag_off = htons((state->offset >> 3)); + if (state->DF) + iph->frag_off |= htons(IP_DF); /* * Added AC : If we are fragmenting a fragment that's not the @@ -771,6 +769,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct rtable *rt = skb_rtable(skb); unsigned int mtu, hlen, ll_rs; struct ip_fraglist_iter iter; + ktime_t tstamp = skb->tstamp; struct ip_frag_state state; int err = 0; @@ -846,6 +845,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, ip_fraglist_prepare(skb, &iter); } + skb->tstamp = tstamp; err = output(net, sk, skb); if (!err) @@ -881,7 +881,8 @@ slow_path: * Fragment the datagram. */ - ip_frag_init(skb, hlen, ll_rs, mtu, &state); + ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU, + &state); /* * Keep copying data until we run out. @@ -900,6 +901,7 @@ slow_path: /* * Put this fragment into the sending queue. */ + skb2->tstamp = tstamp; err = output(net, sk, skb2); if (err) goto fail; @@ -1254,18 +1256,22 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->addr = ipc->addr; } - /* - * We steal reference to this route, caller should not release it - */ - *rtp = NULL; cork->fragsize = ip_sk_use_pmtu(sk) ? - dst_mtu(&rt->dst) : rt->dst.dev->mtu; + dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu); + + if (!inetdev_valid_mtu(cork->fragsize)) + return -ENETUNREACH; cork->gso_size = ipc->gso_size; + cork->dst = &rt->dst; + /* We stole this route, caller should not release it. */ + *rtp = NULL; + cork->length = 0; cork->ttl = ipc->ttl; cork->tos = ipc->tos; + cork->mark = ipc->sockc.mark; cork->priority = ipc->priority; cork->transmit_time = ipc->sockc.transmit_time; cork->tx_flags = 0; @@ -1529,7 +1535,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, } skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = cork->mark; skb->tstamp = cork->transmit_time; /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec @@ -1693,7 +1699,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, inet_sk(sk)->tos = arg->tos; - sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; sk->sk_sndbuf = sysctl_wmem_default; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 38c02bb62e2c..74e1d964a615 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -505,7 +505,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; if (skb_valid_dst(skb)) - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { if (!skb_is_gso(skb) && @@ -1236,10 +1236,8 @@ int ip_tunnel_init(struct net_device *dev) iph->version = 4; iph->ihl = 5; - if (tunnel->collect_md) { - dev->features |= NETIF_F_NETNS_LOCAL; + if (tunnel->collect_md) netif_keep_dst(dev); - } return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_init); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 1452a97914a0..47f8b947eef1 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -34,6 +34,9 @@ #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/dst_metadata.h> +#include <net/geneve.h> +#include <net/vxlan.h> +#include <net/erspan.h> const struct ip_tunnel_encap_ops __rcu * iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; @@ -126,15 +129,14 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, if (!md || md->type != METADATA_IP_TUNNEL || md->u.tun_info.mode & IP_TUNNEL_INFO_TX) - return NULL; - res = metadata_dst_alloc(0, METADATA_IP_TUNNEL, flags); + src = &md->u.tun_info; + res = metadata_dst_alloc(src->options_len, METADATA_IP_TUNNEL, flags); if (!res) return NULL; dst = &res->u.tun_info; - src = &md->u.tun_info; dst->key.tun_id = src->key.tun_id; if (src->mode & IP_TUNNEL_INFO_IPV6) memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src, @@ -143,6 +145,8 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, dst->key.u.ipv4.dst = src->key.u.ipv4.src; dst->key.tun_flags = src->key.tun_flags; dst->mode = src->mode | IP_TUNNEL_INFO_TX; + ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src), + src->options_len, 0); return res; } @@ -211,30 +215,243 @@ void ip_tunnel_get_stats64(struct net_device *dev, EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { + [LWTUNNEL_IP_UNSPEC] = { .strict_start_type = LWTUNNEL_IP_OPTS }, [LWTUNNEL_IP_ID] = { .type = NLA_U64 }, [LWTUNNEL_IP_DST] = { .type = NLA_U32 }, [LWTUNNEL_IP_SRC] = { .type = NLA_U32 }, [LWTUNNEL_IP_TTL] = { .type = NLA_U8 }, [LWTUNNEL_IP_TOS] = { .type = NLA_U8 }, [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, + [LWTUNNEL_IP_OPTS] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = { + [LWTUNNEL_IP_OPTS_GENEVE] = { .type = NLA_NESTED }, + [LWTUNNEL_IP_OPTS_VXLAN] = { .type = NLA_NESTED }, + [LWTUNNEL_IP_OPTS_ERSPAN] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = { + [LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, + [LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, + [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, +}; + +static const struct nla_policy +vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = { + [LWTUNNEL_IP_OPT_VXLAN_GBP] = { .type = NLA_U32 }, }; +static const struct nla_policy +erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = { + [LWTUNNEL_IP_OPT_ERSPAN_VER] = { .type = NLA_U8 }, + [LWTUNNEL_IP_OPT_ERSPAN_INDEX] = { .type = NLA_U32 }, + [LWTUNNEL_IP_OPT_ERSPAN_DIR] = { .type = NLA_U8 }, + [LWTUNNEL_IP_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, +}; + +static int ip_tun_parse_opts_geneve(struct nlattr *attr, + struct ip_tunnel_info *info, int opts_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1]; + int data_len, err; + + err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, attr, + geneve_opt_policy, extack); + if (err) + return err; + + if (!tb[LWTUNNEL_IP_OPT_GENEVE_CLASS] || + !tb[LWTUNNEL_IP_OPT_GENEVE_TYPE] || + !tb[LWTUNNEL_IP_OPT_GENEVE_DATA]) + return -EINVAL; + + attr = tb[LWTUNNEL_IP_OPT_GENEVE_DATA]; + data_len = nla_len(attr); + if (data_len % 4) + return -EINVAL; + + if (info) { + struct geneve_opt *opt = ip_tunnel_info_opts(info) + opts_len; + + memcpy(opt->opt_data, nla_data(attr), data_len); + opt->length = data_len / 4; + attr = tb[LWTUNNEL_IP_OPT_GENEVE_CLASS]; + opt->opt_class = nla_get_be16(attr); + attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE]; + opt->type = nla_get_u8(attr); + info->key.tun_flags |= TUNNEL_GENEVE_OPT; + } + + return sizeof(struct geneve_opt) + data_len; +} + +static int ip_tun_parse_opts_vxlan(struct nlattr *attr, + struct ip_tunnel_info *info, int opts_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1]; + int err; + + err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, attr, + vxlan_opt_policy, extack); + if (err) + return err; + + if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP]) + return -EINVAL; + + if (info) { + struct vxlan_metadata *md = + ip_tunnel_info_opts(info) + opts_len; + + attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP]; + md->gbp = nla_get_u32(attr); + info->key.tun_flags |= TUNNEL_VXLAN_OPT; + } + + return sizeof(struct vxlan_metadata); +} + +static int ip_tun_parse_opts_erspan(struct nlattr *attr, + struct ip_tunnel_info *info, int opts_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1]; + int err; + u8 ver; + + err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr, + erspan_opt_policy, extack); + if (err) + return err; + + if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER]) + return -EINVAL; + + ver = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]); + if (ver == 1) { + if (!tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]) + return -EINVAL; + } else if (ver == 2) { + if (!tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] || + !tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]) + return -EINVAL; + } else { + return -EINVAL; + } + + if (info) { + struct erspan_metadata *md = + ip_tunnel_info_opts(info) + opts_len; + + md->version = ver; + if (ver == 1) { + attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]; + md->u.index = nla_get_be32(attr); + } else { + attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR]; + md->u.md2.dir = nla_get_u8(attr); + attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]; + set_hwid(&md->u.md2, nla_get_u8(attr)); + } + + info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + } + + return sizeof(struct erspan_metadata); +} + +static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, + struct netlink_ext_ack *extack) +{ + int err, rem, opt_len, opts_len = 0, type = 0; + struct nlattr *nla; + + if (!attr) + return 0; + + err = nla_validate(nla_data(attr), nla_len(attr), LWTUNNEL_IP_OPTS_MAX, + ip_opts_policy, extack); + if (err) + return err; + + nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) { + switch (nla_type(nla)) { + case LWTUNNEL_IP_OPTS_GENEVE: + if (type && type != TUNNEL_GENEVE_OPT) + return -EINVAL; + opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len, + extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + if (opts_len > IP_TUNNEL_OPTS_MAX) + return -EINVAL; + type = TUNNEL_GENEVE_OPT; + break; + case LWTUNNEL_IP_OPTS_VXLAN: + if (type) + return -EINVAL; + opt_len = ip_tun_parse_opts_vxlan(nla, info, opts_len, + extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + type = TUNNEL_VXLAN_OPT; + break; + case LWTUNNEL_IP_OPTS_ERSPAN: + if (type) + return -EINVAL; + opt_len = ip_tun_parse_opts_erspan(nla, info, opts_len, + extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + type = TUNNEL_ERSPAN_OPT; + break; + default: + return -EINVAL; + } + } + + return opts_len; +} + +static int ip_tun_get_optlen(struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + return ip_tun_parse_opts(attr, NULL, extack); +} + +static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info, + struct netlink_ext_ack *extack) +{ + return ip_tun_parse_opts(attr, info, extack); +} + static int ip_tun_build_state(struct nlattr *attr, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { - struct ip_tunnel_info *tun_info; - struct lwtunnel_state *new_state; struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; - int err; + struct lwtunnel_state *new_state; + struct ip_tunnel_info *tun_info; + int err, opt_len; err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, extack); if (err < 0) return err; - new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack); + if (opt_len < 0) + return opt_len; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len); if (!new_state) return -ENOMEM; @@ -242,6 +459,12 @@ static int ip_tun_build_state(struct nlattr *attr, tun_info = lwt_tun_info(new_state); + err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], tun_info, extack); + if (err < 0) { + lwtstate_free(new_state); + return err; + } + #ifdef CONFIG_DST_CACHE err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL); if (err) { @@ -266,10 +489,12 @@ static int ip_tun_build_state(struct nlattr *attr, tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); if (tb[LWTUNNEL_IP_FLAGS]) - tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP_FLAGS]); + tun_info->key.tun_flags |= + (nla_get_be16(tb[LWTUNNEL_IP_FLAGS]) & + ~TUNNEL_OPTIONS_PRESENT); tun_info->mode = IP_TUNNEL_INFO_TX; - tun_info->options_len = 0; + tun_info->options_len = opt_len; *ts = new_state; @@ -285,6 +510,114 @@ static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate) #endif } +static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct geneve_opt *opt; + struct nlattr *nest; + int offset = 0; + + nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE); + if (!nest) + return -ENOMEM; + + while (tun_info->options_len > offset) { + opt = ip_tunnel_info_opts(tun_info) + offset; + if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS, + opt->opt_class) || + nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) || + nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4, + opt->opt_data)) { + nla_nest_cancel(skb, nest); + return -ENOMEM; + } + offset += sizeof(*opt) + opt->length * 4; + } + + nla_nest_end(skb, nest); + return 0; +} + +static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct vxlan_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN); + if (!nest) + return -ENOMEM; + + md = ip_tunnel_info_opts(tun_info); + if (nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) { + nla_nest_cancel(skb, nest); + return -ENOMEM; + } + + nla_nest_end(skb, nest); + return 0; +} + +static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct erspan_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN); + if (!nest) + return -ENOMEM; + + md = ip_tunnel_info_opts(tun_info); + if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version)) + goto err; + + if (md->version == 1 && + nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX, md->u.index)) + goto err; + + if (md->version == 2 && + (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR, md->u.md2.dir) || + nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID, + get_hwid(&md->u.md2)))) + goto err; + + nla_nest_end(skb, nest); + return 0; +err: + nla_nest_cancel(skb, nest); + return -ENOMEM; +} + +static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type, + struct ip_tunnel_info *tun_info) +{ + struct nlattr *nest; + int err = 0; + + if (!(tun_info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)) + return 0; + + nest = nla_nest_start_noflag(skb, type); + if (!nest) + return -ENOMEM; + + if (tun_info->key.tun_flags & TUNNEL_GENEVE_OPT) + err = ip_tun_fill_encap_opts_geneve(skb, tun_info); + else if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT) + err = ip_tun_fill_encap_opts_vxlan(skb, tun_info); + else if (tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT) + err = ip_tun_fill_encap_opts_erspan(skb, tun_info); + + if (err) { + nla_nest_cancel(skb, nest); + return err; + } + + nla_nest_end(skb, nest); + return 0; +} + static int ip_tun_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { @@ -296,12 +629,52 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || - nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) + nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags) || + ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info)) return -ENOMEM; return 0; } +static int ip_tun_opts_nlsize(struct ip_tunnel_info *info) +{ + int opt_len; + + if (!(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)) + return 0; + + opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */ + if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { + struct geneve_opt *opt; + int offset = 0; + + opt_len += nla_total_size(0); /* LWTUNNEL_IP_OPTS_GENEVE */ + while (info->options_len > offset) { + opt = ip_tunnel_info_opts(info) + offset; + opt_len += nla_total_size(2) /* OPT_GENEVE_CLASS */ + + nla_total_size(1) /* OPT_GENEVE_TYPE */ + + nla_total_size(opt->length * 4); + /* OPT_GENEVE_DATA */ + offset += sizeof(*opt) + opt->length * 4; + } + } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { + opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */ + + nla_total_size(4); /* OPT_VXLAN_GBP */ + } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) { + struct erspan_metadata *md = ip_tunnel_info_opts(info); + + opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_ERSPAN */ + + nla_total_size(1) /* OPT_ERSPAN_VER */ + + (md->version == 1 ? nla_total_size(4) + /* OPT_ERSPAN_INDEX (v1) */ + : nla_total_size(1) + + nla_total_size(1)); + /* OPT_ERSPAN_DIR + HWID (v2) */ + } + + return opt_len; +} + static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(8) /* LWTUNNEL_IP_ID */ @@ -309,13 +682,21 @@ static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + nla_total_size(1) /* LWTUNNEL_IP_TOS */ + nla_total_size(1) /* LWTUNNEL_IP_TTL */ - + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */ + + nla_total_size(2) /* LWTUNNEL_IP_FLAGS */ + + ip_tun_opts_nlsize(lwt_tun_info(lwtstate)); + /* LWTUNNEL_IP_OPTS */ } static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { - return memcmp(lwt_tun_info(a), lwt_tun_info(b), - sizeof(struct ip_tunnel_info)); + struct ip_tunnel_info *info_a = lwt_tun_info(a); + struct ip_tunnel_info *info_b = lwt_tun_info(b); + + return memcmp(info_a, info_b, sizeof(info_a->key)) || + info_a->mode != info_b->mode || + info_a->options_len != info_b->options_len || + memcmp(ip_tunnel_info_opts(info_a), + ip_tunnel_info_opts(info_b), info_a->options_len); } static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { @@ -328,12 +709,14 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { }; static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { + [LWTUNNEL_IP6_UNSPEC] = { .strict_start_type = LWTUNNEL_IP6_OPTS }, [LWTUNNEL_IP6_ID] = { .type = NLA_U64 }, [LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 }, [LWTUNNEL_IP6_TC] = { .type = NLA_U8 }, [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, + [LWTUNNEL_IP6_OPTS] = { .type = NLA_NESTED }, }; static int ip6_tun_build_state(struct nlattr *attr, @@ -341,17 +724,21 @@ static int ip6_tun_build_state(struct nlattr *attr, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { - struct ip_tunnel_info *tun_info; - struct lwtunnel_state *new_state; struct nlattr *tb[LWTUNNEL_IP6_MAX + 1]; - int err; + struct lwtunnel_state *new_state; + struct ip_tunnel_info *tun_info; + int err, opt_len; err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy, extack); if (err < 0) return err; - new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack); + if (opt_len < 0) + return opt_len; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len); if (!new_state) return -ENOMEM; @@ -359,6 +746,12 @@ static int ip6_tun_build_state(struct nlattr *attr, tun_info = lwt_tun_info(new_state); + err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], tun_info, extack); + if (err < 0) { + lwtstate_free(new_state); + return err; + } + if (tb[LWTUNNEL_IP6_ID]) tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]); @@ -375,10 +768,12 @@ static int ip6_tun_build_state(struct nlattr *attr, tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); if (tb[LWTUNNEL_IP6_FLAGS]) - tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]); + tun_info->key.tun_flags |= + (nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]) & + ~TUNNEL_OPTIONS_PRESENT); tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6; - tun_info->options_len = 0; + tun_info->options_len = opt_len; *ts = new_state; @@ -396,7 +791,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) || - nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) + nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags) || + ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info)) return -ENOMEM; return 0; @@ -409,7 +805,9 @@ static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + nla_total_size(1) /* LWTUNNEL_IP6_TC */ - + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */ + + nla_total_size(2) /* LWTUNNEL_IP6_FLAGS */ + + ip_tun_opts_nlsize(lwt_tun_info(lwtstate)); + /* LWTUNNEL_IP6_OPTS */ } static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index cfb025606793..37cddd18f282 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -187,8 +187,17 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, int mtu; if (!dst) { - dev->stats.tx_carrier_errors++; - goto tx_error_icmp; + struct rtable *rt; + + fl->u.ip4.flowi4_oif = dev->ifindex; + fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; + rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + dst = &rt->dst; + skb_dst_set(skb, dst); } dst_hold(dst); @@ -214,7 +223,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, mtu = dst_mtu(dst); if (skb->len > mtu) { - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); @@ -580,8 +589,8 @@ static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = { [IFLA_VTI_LINK] = { .type = NLA_U32 }, [IFLA_VTI_IKEY] = { .type = NLA_U32 }, [IFLA_VTI_OKEY] = { .type = NLA_U32 }, - [IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, - [IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, + [IFLA_VTI_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) }, + [IFLA_VTI_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_VTI_FWMARK] = { .type = NLA_U32 }, }; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 9bcca08efec9..4438f6b12335 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1334,7 +1334,7 @@ static int __init ipconfig_proc_net_init(void) /* Create a new file under /proc/net/ipconfig */ static int ipconfig_proc_net_create(const char *name, - const struct file_operations *fops) + const struct proc_ops *proc_ops) { char *pname; struct proc_dir_entry *p; @@ -1346,7 +1346,7 @@ static int ipconfig_proc_net_create(const char *name, if (!pname) return -ENOMEM; - p = proc_create(pname, 0444, init_net.proc_net, fops); + p = proc_create(pname, 0444, init_net.proc_net, proc_ops); kfree(pname); if (!p) return -ENOMEM; @@ -1355,7 +1355,7 @@ static int ipconfig_proc_net_create(const char *name, } /* Write NTP server IP addresses to /proc/net/ipconfig/ntp_servers */ -static int ntp_servers_seq_show(struct seq_file *seq, void *v) +static int ntp_servers_show(struct seq_file *seq, void *v) { int i; @@ -1365,7 +1365,7 @@ static int ntp_servers_seq_show(struct seq_file *seq, void *v) } return 0; } -DEFINE_SHOW_ATTRIBUTE(ntp_servers_seq); +DEFINE_PROC_SHOW_ATTRIBUTE(ntp_servers); #endif /* CONFIG_PROC_FS */ /* @@ -1412,6 +1412,9 @@ static int __init wait_for_devices(void) struct net_device *dev; int found = 0; + /* make sure deferred device probes are finished */ + wait_for_device_probe(); + rtnl_lock(); for_each_netdev(&init_net, dev) { if (ic_is_init_dev(dev)) { @@ -1453,7 +1456,7 @@ static int __init ip_auto_config(void) proc_create_single("pnp", 0444, init_net.proc_net, pnp_seq_show); if (ipconfig_proc_net_init() == 0) - ipconfig_proc_net_create("ntp_servers", &ntp_servers_seq_fops); + ipconfig_proc_net_create("ntp_servers", &ntp_servers_proc_ops); #endif /* CONFIG_PROC_FS */ if (!ic_enable) @@ -1483,10 +1486,10 @@ static int __init ip_auto_config(void) * missing values. */ if (ic_myaddr == NONE || -#ifdef CONFIG_ROOT_NFS +#if defined(CONFIG_ROOT_NFS) || defined(CONFIG_CIFS_ROOT) (root_server_addr == NONE && ic_servaddr == NONE && - ROOT_DEV == Root_NFS) || + (ROOT_DEV == Root_NFS || ROOT_DEV == Root_CIFS)) || #endif ic_first_dev->next) { #ifdef IPCONFIG_DYNAMIC @@ -1513,6 +1516,12 @@ static int __init ip_auto_config(void) goto try_try_again; } #endif +#ifdef CONFIG_CIFS_ROOT + if (ROOT_DEV == Root_CIFS) { + pr_err("IP-Config: Retrying forever (CIFS root)...\n"); + goto try_try_again; + } +#endif if (--retries) { pr_err("IP-Config: Reopening network devices...\n"); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c07bc82cbbe9..6e68def66822 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -278,9 +278,10 @@ static void __net_exit ipmr_rules_exit(struct net *net) rtnl_unlock(); } -static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR); + return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR, extack); } static unsigned int ipmr_rules_seq_read(struct net *net) @@ -336,7 +337,8 @@ static void __net_exit ipmr_rules_exit(struct net *net) rtnl_unlock(); } -static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return 0; } @@ -1134,8 +1136,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, if (!found) { /* Create a new entry if allowable */ - if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || - (c = ipmr_cache_alloc_unres()) == NULL) { + c = ipmr_cache_alloc_unres(); + if (!c) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); @@ -1794,7 +1796,7 @@ static void ip_encap(struct net *net, struct sk_buff *skb, ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - nf_reset(skb); + nf_reset_ct(skb); } static inline int ipmr_forward_finish(struct net *net, struct sock *sk, @@ -2140,7 +2142,7 @@ int ip_mr_input(struct sk_buff *skb) mroute_sk = rcu_dereference(mrt->mroute_sk); if (mroute_sk) { - nf_reset(skb); + nf_reset_ct(skb); raw_rcv(mroute_sk, skb); return 0; } @@ -2289,7 +2291,8 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, rcu_read_unlock(); return -ENODEV; } - skb2 = skb_clone(skb, GFP_ATOMIC); + + skb2 = skb_realloc_headroom(skb, sizeof(struct iphdr)); if (!skb2) { read_unlock(&mrt_lock); rcu_read_unlock(); @@ -3040,10 +3043,11 @@ static unsigned int ipmr_seq_read(struct net *net) return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net); } -static int ipmr_dump(struct net *net, struct notifier_block *nb) +static int ipmr_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump, - ipmr_mr_table_iter, &mrt_lock); + ipmr_mr_table_iter, &mrt_lock, extack); } static const struct fib_notifier_ops ipmr_notifier_ops_template = { diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index ea48bd15a575..aa8738a91210 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -386,15 +386,17 @@ EXPORT_SYMBOL(mr_rtm_dumproute); int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, - struct notifier_block *nb), + struct notifier_block *nb, + struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), - rwlock_t *mrt_lock) + rwlock_t *mrt_lock, + struct netlink_ext_ack *extack) { struct mr_table *mrt; int err; - err = rules_dump(net, nb); + err = rules_dump(net, nb, extack); if (err) return err; @@ -409,17 +411,25 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, if (!v->dev) continue; - mr_call_vif_notifier(nb, net, family, - FIB_EVENT_VIF_ADD, - v, vifi, mrt->id); + err = mr_call_vif_notifier(nb, family, + FIB_EVENT_VIF_ADD, + v, vifi, mrt->id, extack); + if (err) + break; } read_unlock(mrt_lock); + if (err) + return err; + /* Notify on table MFC entries */ - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) - mr_call_mfc_notifier(nb, net, family, - FIB_EVENT_ENTRY_ADD, - mfc, mrt->id); + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { + err = mr_call_mfc_notifier(nb, family, + FIB_EVENT_ENTRY_ADD, + mfc, mrt->id, extack); + if (err) + return err; + } } return 0; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 69e76d677f9e..f17b402111ce 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -272,7 +272,7 @@ config IP_NF_TARGET_CLUSTERIP The CLUSTERIP target allows you to build load-balancing clusters of network servers without having a dedicated load-balancing router/server/switch. - + To compile it as a module, choose M here. If unsure, say N. config IP_NF_TARGET_ECN @@ -281,7 +281,7 @@ config IP_NF_TARGET_ECN depends on NETFILTER_ADVANCED ---help--- This option adds a `ECN' target, which can be used in the iptables mangle - table. + table. You can use this target to remove the ECN bits from the IPv4 header of an IP packet. This is particularly useful, if you need to work around @@ -306,7 +306,7 @@ config IP_NF_RAW This option adds a `raw' table to iptables. This table is the very first in the netfilter framework and hooks in at the PREROUTING and OUTPUT chains. - + If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.rst>. If unsure, say `N'. @@ -318,7 +318,7 @@ config IP_NF_SECURITY help This option adds a `security' table to iptables, for use with Mandatory Access Control (MAC) policy. - + If unsure, say N. endif # IP_NF_IPTABLES diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index c50e0ec095d2..7c497c78105f 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -31,7 +31,7 @@ obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o # flow table support obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o -# generic IP tables +# generic IP tables obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o # the three instances of ip_tables diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 214154b47d56..f1f78a742b36 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -384,10 +384,11 @@ next: ; return 1; } -static inline int check_target(struct arpt_entry *e, const char *name) +static int check_target(struct arpt_entry *e, struct net *net, const char *name) { struct xt_entry_target *t = arpt_get_target(e); struct xt_tgchk_param par = { + .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, @@ -399,8 +400,9 @@ static inline int check_target(struct arpt_entry *e, const char *name) return xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); } -static inline int -find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, +static int +find_check_entry(struct arpt_entry *e, struct net *net, const char *name, + unsigned int size, struct xt_percpu_counter_alloc_state *alloc_state) { struct xt_entry_target *t; @@ -419,7 +421,7 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, } t->u.kernel.target = target; - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto err; return 0; @@ -494,12 +496,13 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, return 0; } -static inline void cleanup_entry(struct arpt_entry *e) +static void cleanup_entry(struct arpt_entry *e, struct net *net) { struct xt_tgdtor_param par; struct xt_entry_target *t; t = arpt_get_target(e); + par.net = net; par.target = t->u.kernel.target; par.targinfo = t->data; par.family = NFPROTO_ARP; @@ -512,7 +515,9 @@ static inline void cleanup_entry(struct arpt_entry *e) /* Checks and translates the user-supplied table segment (held in * newinfo). */ -static int translate_table(struct xt_table_info *newinfo, void *entry0, +static int translate_table(struct net *net, + struct xt_table_info *newinfo, + void *entry0, const struct arpt_replace *repl) { struct xt_percpu_counter_alloc_state alloc_state = { 0 }; @@ -569,7 +574,7 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, /* Finally, each sanity check must pass */ i = 0; xt_entry_foreach(iter, entry0, newinfo->size) { - ret = find_check_entry(iter, repl->name, repl->size, + ret = find_check_entry(iter, net, repl->name, repl->size, &alloc_state); if (ret != 0) break; @@ -580,7 +585,7 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, xt_entry_foreach(iter, entry0, newinfo->size) { if (i-- == 0) break; - cleanup_entry(iter); + cleanup_entry(iter, net); } return ret; } @@ -923,7 +928,7 @@ static int __do_replace(struct net *net, const char *name, /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries; xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) - cleanup_entry(iter); + cleanup_entry(iter, net); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, @@ -974,7 +979,7 @@ static int do_replace(struct net *net, const void __user *user, goto free_newinfo; } - ret = translate_table(newinfo, loc_cpu_entry, &tmp); + ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; @@ -986,7 +991,7 @@ static int do_replace(struct net *net, const void __user *user, free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) - cleanup_entry(iter); + cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1149,7 +1154,8 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr, } } -static int translate_compat_table(struct xt_table_info **pinfo, +static int translate_compat_table(struct net *net, + struct xt_table_info **pinfo, void **pentry0, const struct compat_arpt_replace *compatr) { @@ -1217,7 +1223,7 @@ static int translate_compat_table(struct xt_table_info **pinfo, repl.num_counters = 0; repl.counters = NULL; repl.size = newinfo->size; - ret = translate_table(newinfo, entry1, &repl); + ret = translate_table(net, newinfo, entry1, &repl); if (ret) goto free_newinfo; @@ -1270,7 +1276,7 @@ static int compat_do_replace(struct net *net, void __user *user, goto free_newinfo; } - ret = translate_compat_table(&newinfo, &loc_cpu_entry, &tmp); + ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; @@ -1282,7 +1288,7 @@ static int compat_do_replace(struct net *net, void __user *user, free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) - cleanup_entry(iter); + cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1509,7 +1515,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len return ret; } -static void __arpt_unregister_table(struct xt_table *table) +static void __arpt_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; @@ -1521,7 +1527,7 @@ static void __arpt_unregister_table(struct xt_table *table) /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) - cleanup_entry(iter); + cleanup_entry(iter, net); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); @@ -1546,7 +1552,7 @@ int arpt_register_table(struct net *net, loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); - ret = translate_table(newinfo, loc_cpu_entry, repl); + ret = translate_table(net, newinfo, loc_cpu_entry, repl); if (ret != 0) goto out_free; @@ -1561,7 +1567,7 @@ int arpt_register_table(struct net *net, ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); if (ret != 0) { - __arpt_unregister_table(new_table); + __arpt_unregister_table(net, new_table); *res = NULL; } @@ -1576,7 +1582,7 @@ void arpt_unregister_table(struct net *net, struct xt_table *table, const struct nf_hook_ops *ops) { nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); - __arpt_unregister_table(table); + __arpt_unregister_table(net, table); } /* The built-in targets: standard (NULL) and error. */ diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 6bdb1ab8af61..f8755a4ae9d4 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -58,7 +58,7 @@ struct clusterip_config { }; #ifdef CONFIG_PROC_FS -static const struct file_operations clusterip_proc_fops; +static const struct proc_ops clusterip_proc_ops; #endif struct clusterip_net { @@ -280,7 +280,7 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, mutex_lock(&cn->mutex); c->pde = proc_create_data(buffer, 0600, cn->procdir, - &clusterip_proc_fops, c); + &clusterip_proc_ops, c); mutex_unlock(&cn->mutex); if (!c->pde) { err = -ENOMEM; @@ -804,12 +804,12 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input, return size; } -static const struct file_operations clusterip_proc_fops = { - .open = clusterip_proc_open, - .read = seq_read, - .write = clusterip_proc_write, - .llseek = seq_lseek, - .release = clusterip_proc_release, +static const struct proc_ops clusterip_proc_ops = { + .proc_open = clusterip_proc_open, + .proc_read = seq_read, + .proc_write = clusterip_proc_write, + .proc_lseek = seq_lseek, + .proc_release = clusterip_proc_release, }; #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 0e70f3f65f6f..748dc3ce58d3 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -36,8 +36,8 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) opts.options |= XT_SYNPROXY_OPT_ECN; opts.options &= info->options; - opts.mss_encode = opts.mss; - opts.mss = info->mss; + opts.mss_encode = opts.mss_option; + opts.mss_option = info->mss; if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, &opts); else diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index af3fbf76dbd3..6cc5743c553a 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -65,7 +65,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Avoid counting cloned packets towards the original connection. */ - nf_reset(skb); + nf_reset_ct(skb); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); #endif /* diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c index 012c4047c788..e32e41b99f0f 100644 --- a/net/ipv4/netfilter/nf_flow_table_ipv4.c +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -9,6 +9,8 @@ static struct nf_flowtable_type flowtable_ipv4 = { .family = NFPROTO_IPV4, .init = nf_flow_table_init, + .setup = nf_flow_table_offload_setup, + .action = nf_flow_rule_route_ipv4, .free = nf_flow_table_free, .hook = nf_flow_offload_ip_hook, .owner = THIS_MODULE, diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index 36a28d46149c..c94445b44d8c 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -31,16 +31,8 @@ extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol, if (icmph == NULL) return 1; - switch (icmph->type) { - case ICMP_DEST_UNREACH: - case ICMP_SOURCE_QUENCH: - case ICMP_REDIRECT: - case ICMP_TIME_EXCEEDED: - case ICMP_PARAMETERPROB: - break; - default: + if (!icmp_is_err(icmph->type)) return 1; - } inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(struct icmphdr), diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 5fe5a3981d43..d072c326dd64 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -23,7 +23,6 @@ static void remove_nexthop(struct net *net, struct nexthop *nh, #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { - [NHA_UNSPEC] = { .strict_start_type = NHA_UNSPEC + 1 }, [NHA_ID] = { .type = NLA_U32 }, [NHA_GROUP] = { .type = NLA_BINARY }, [NHA_GROUP_TYPE] = { .type = NLA_U16 }, @@ -322,7 +321,9 @@ static size_t nh_nlmsg_size_single(struct nexthop *nh) static size_t nh_nlmsg_size(struct nexthop *nh) { - size_t sz = nla_total_size(4); /* NHA_ID */ + size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg)); + + sz += nla_total_size(4); /* NHA_ID */ if (nh->is_group) sz += nh_nlmsg_size_grp(nh); @@ -1151,7 +1152,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, .fc_encap_type = cfg->nh_encap_type, }; u32 tb_id = l3mdev_fib_table(cfg->dev); - int err = -EINVAL; + int err; err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); if (err) { diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 9d24ef5c5d8f..535427292194 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -781,7 +781,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } else if (!ipc.oif) ipc.oif = inet->uc_index; - flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, sk->sk_uid); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index cc90243ccf76..2580303249e2 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -289,6 +289,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), + SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH), + SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 40a6abbc9cf6..3183413ebc6c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -332,7 +332,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); return NET_RX_DROP; } - nf_reset(skb); + nf_reset_ct(skb); skb_push(skb, skb->data - skb_network_header(skb)); @@ -375,7 +375,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb_reserve(skb, hlen); skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = sockc->mark; skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *rtp = NULL; @@ -623,7 +623,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } } - flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, RT_SCOPE_UNIVERSE, hdrincl ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk) | diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 517300d587a7..ebe7060d0fc9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -139,7 +139,8 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu); + struct sk_buff *skb, u32 mtu, + bool confirm_neigh); static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static void ipv4_dst_destroy(struct dst_entry *dst); @@ -236,11 +237,11 @@ static int rt_cache_seq_open(struct inode *inode, struct file *file) return seq_open(file, &rt_cache_seq_ops); } -static const struct file_operations rt_cache_seq_fops = { - .open = rt_cache_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +static const struct proc_ops rt_cache_proc_ops = { + .proc_open = rt_cache_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; @@ -270,6 +271,7 @@ static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } + (*pos)++; return NULL; } @@ -326,11 +328,11 @@ static int rt_cpu_seq_open(struct inode *inode, struct file *file) return seq_open(file, &rt_cpu_seq_ops); } -static const struct file_operations rt_cpu_seq_fops = { - .open = rt_cpu_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +static const struct proc_ops rt_cpu_proc_ops = { + .proc_open = rt_cpu_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; #ifdef CONFIG_IP_ROUTE_CLASSID @@ -364,12 +366,12 @@ static int __net_init ip_rt_do_proc_init(struct net *net) struct proc_dir_entry *pde; pde = proc_create("rt_cache", 0444, net->proc_net, - &rt_cache_seq_fops); + &rt_cache_proc_ops); if (!pde) goto err1; pde = proc_create("rt_cache", 0444, - net->proc_net_stat, &rt_cpu_seq_fops); + net->proc_net_stat, &rt_cpu_proc_ops); if (!pde) goto err2; @@ -635,6 +637,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; + rt->rt_uses_gateway = 1; rt->rt_gw_family = AF_INET; rt->rt_gw4 = fnhe->fnhe_gw; } @@ -915,16 +918,15 @@ void ip_rt_send_redirect(struct sk_buff *skb) if (peer->rate_tokens == 0 || time_after(jiffies, (peer->rate_last + - (ip_rt_redirect_load << peer->rate_tokens)))) { + (ip_rt_redirect_load << peer->n_redirects)))) { __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; - ++peer->rate_tokens; ++peer->n_redirects; #ifdef CONFIG_IP_ROUTE_VERBOSE if (log_martians && - peer->rate_tokens == ip_rt_redirect_number) + peer->n_redirects == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); @@ -1043,7 +1045,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) } static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { struct rtable *rt = (struct rtable *) dst; struct flowi4 fl4; @@ -1313,7 +1316,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = READ_ONCE(dst->dev->mtu); if (unlikely(ip_mtu_locked(dst))) { - if (rt->rt_gw_family && mtu > 576) + if (rt->rt_uses_gateway && mtu > 576) mtu = 576; } @@ -1482,7 +1485,7 @@ static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt) prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) { - dst_dev_put(&orig->dst); + rt_add_uncached_list(orig); dst_release(&orig->dst); } } else { @@ -1569,6 +1572,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, struct fib_nh_common *nhc = FIB_RES_NHC(*res); if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { + rt->rt_uses_gateway = 1; rt->rt_gw_family = nhc->nhc_gw_family; /* only INET and INET6 are supported */ if (likely(nhc->nhc_gw_family == AF_INET)) @@ -1634,6 +1638,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt->rt_iif = 0; rt->rt_pmtu = 0; rt->rt_mtu_locked = 0; + rt->rt_uses_gateway = 0; rt->rt_gw_family = 0; rt->rt_gw4 = 0; INIT_LIST_HEAD(&rt->rt_uncached); @@ -1892,10 +1897,7 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb, if (!icmph) goto out; - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_REDIRECT && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB) + if (!icmp_is_err(icmph->type)) goto out; inner_iph = skb_header_pointer(skb, @@ -2020,10 +2022,52 @@ static int ip_mkroute_input(struct sk_buff *skb, return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); } +/* Implements all the saddr-related checks as ip_route_input_slow(), + * assuming daddr is valid and the destination is not a local broadcast one. + * Uses the provided hint instead of performing a route lookup. + */ +int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + const struct sk_buff *hint) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + struct rtable *rt = (struct rtable *)hint; + struct net *net = dev_net(dev); + int err = -EINVAL; + u32 tag = 0; + + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) + goto martian_source; + + if (ipv4_is_zeronet(saddr)) + goto martian_source; + + if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + goto martian_source; + + if (rt->rt_type != RTN_LOCAL) + goto skip_validate_source; + + tos &= IPTOS_RT_MASK; + err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag); + if (err < 0) + goto martian_source; + +skip_validate_source: + skb_dst_copy(skb, hint); + return 0; + +martian_source: + ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); + return err; +} + /* * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet * must have correct destination already attached by output routine. + * Changes in the enforced policies must be applied also to + * ip_route_use_hint(). * * Such approach solves two big problems: * 1. Not simplex devices are handled properly. @@ -2468,14 +2512,17 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, int orig_oif = fl4->flowi4_oif; unsigned int flags = 0; struct rtable *rth; - int err = -ENETUNREACH; + int err; if (fl4->saddr) { - rth = ERR_PTR(-EINVAL); if (ipv4_is_multicast(fl4->saddr) || ipv4_is_lbcast(fl4->saddr) || - ipv4_is_zeronet(fl4->saddr)) + ipv4_is_zeronet(fl4->saddr)) { + rth = ERR_PTR(-EINVAL); goto out; + } + + rth = ERR_PTR(-ENETUNREACH); /* I removed check for oif == dev_out->oif here. It was wrong for two reasons: @@ -2643,7 +2690,8 @@ static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) } static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { } @@ -2694,6 +2742,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; + rt->rt_uses_gateway = ort->rt_uses_gateway; rt->rt_gw_family = ort->rt_gw_family; if (rt->rt_gw_family == AF_INET) rt->rt_gw4 = ort->rt_gw4; @@ -2728,7 +2777,8 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow); /* called with rcu_read_lock held */ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, struct rtable *rt, u32 table_id, struct flowi4 *fl4, - struct sk_buff *skb, u32 portid, u32 seq) + struct sk_buff *skb, u32 portid, u32 seq, + unsigned int flags) { struct rtmsg *r; struct nlmsghdr *nlh; @@ -2736,7 +2786,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 error; u32 metrics[RTAX_MAX]; - nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0); + nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags); if (!nlh) return -EMSGSIZE; @@ -2777,21 +2827,23 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } - if (rt->rt_gw_family == AF_INET && - nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { - goto nla_put_failure; - } else if (rt->rt_gw_family == AF_INET6) { - int alen = sizeof(struct in6_addr); - struct nlattr *nla; - struct rtvia *via; - - nla = nla_reserve(skb, RTA_VIA, alen + 2); - if (!nla) + if (rt->rt_uses_gateway) { + if (rt->rt_gw_family == AF_INET && + nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { goto nla_put_failure; - - via = nla_data(nla); - via->rtvia_family = AF_INET6; - memcpy(via->rtvia_addr, &rt->rt_gw6, alen); + } else if (rt->rt_gw_family == AF_INET6) { + int alen = sizeof(struct in6_addr); + struct nlattr *nla; + struct rtvia *via; + + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) + goto nla_put_failure; + + via = nla_data(nla); + via->rtvia_family = AF_INET6; + memcpy(via->rtvia_addr, &rt->rt_gw6, alen); + } } expires = rt->dst.expires; @@ -2860,7 +2912,7 @@ nla_put_failure: static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fnhe_hash_bucket *bucket, int genid, - int *fa_index, int fa_start) + int *fa_index, int fa_start, unsigned int flags) { int i; @@ -2891,7 +2943,7 @@ static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt, table_id, NULL, skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); + cb->nlh->nlmsg_seq, flags); if (err) return err; next: @@ -2904,7 +2956,7 @@ next: int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fib_info *fi, - int *fa_index, int fa_start) + int *fa_index, int fa_start, unsigned int flags) { struct net *net = sock_net(cb->skb->sk); int nhsel, genid = fnhe_genid(net); @@ -2922,7 +2974,8 @@ int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, err = 0; if (bucket) err = fnhe_dump_bucket(net, skb, cb, table_id, bucket, - genid, fa_index, fa_start); + genid, fa_index, fa_start, + flags); rcu_read_unlock(); if (err) return err; @@ -3171,19 +3224,45 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb_reset_mac_header(skb); if (rtm->rtm_flags & RTM_F_FIB_MATCH) { + struct fib_rt_info fri; + if (!res.fi) { err = fib_props[res.type].error; if (!err) err = -EHOSTUNREACH; goto errout_rcu; } + fri.fi = res.fi; + fri.tb_id = table_id; + fri.dst = res.prefix; + fri.dst_len = res.prefixlen; + fri.tos = fl4.flowi4_tos; + fri.type = rt->rt_type; + fri.offload = 0; + fri.trap = 0; + if (res.fa_head) { + struct fib_alias *fa; + + hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) { + u8 slen = 32 - fri.dst_len; + + if (fa->fa_slen == slen && + fa->tb_id == fri.tb_id && + fa->fa_tos == fri.tos && + fa->fa_info == res.fi && + fa->fa_type == fri.type) { + fri.offload = fa->offload; + fri.trap = fa->trap; + break; + } + } + } err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, RTM_NEWROUTE, table_id, - rt->rt_type, res.prefix, res.prefixlen, - fl4.flowi4_tos, res.fi, 0); + nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0); } else { err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb, - NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0); } if (err < 0) goto errout_rcu; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 535b69326f66..9a4f6b16c9bc 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -62,10 +62,10 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, * Since subsequent timestamps use the normal tcp_time_stamp value, we * must make sure that the resulting initial timestamp is <= tcp_time_stamp. */ -u64 cookie_init_timestamp(struct request_sock *req) +u64 cookie_init_timestamp(struct request_sock *req, u64 now) { struct inet_request_sock *ireq; - u32 ts, ts_now = tcp_time_stamp_raw(); + u32 ts, ts_now = tcp_ns_to_ts(now); u32 options = 0; ireq = inet_rsk(req); @@ -349,6 +349,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; treq->snt_synack = 0; treq->tfo_listener = false; + + if (IS_ENABLED(CONFIG_MPTCP)) + treq->is_mptcp = 0; + if (IS_ENABLED(CONFIG_SMC)) ireq->smc_ok = 0; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0b980e841927..9684af02e0a5 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -340,6 +340,10 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, user_key[i * 4 + 1], user_key[i * 4 + 2], user_key[i * 4 + 3]); + + if (WARN_ON_ONCE(off >= tbl.maxlen - 1)) + break; + if (i + 1 < n_keys) off += snprintf(tbl.data + off, tbl.maxlen - off, ","); } @@ -820,6 +824,15 @@ static struct ctl_table ipv4_net_table[] = { .extra2 = &tcp_min_snd_mss_max, }, { + .procname = "tcp_mtu_probe_floor", + .data = &init_net.ipv4.sysctl_tcp_mtu_probe_floor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_min_snd_mss_min, + .extra2 = &tcp_min_snd_mss_max, + }, + { .procname = "tcp_probe_threshold", .data = &init_net.ipv4.sysctl_tcp_probe_threshold, .maxlen = sizeof(int), @@ -1028,7 +1041,7 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = &two, }, #endif { @@ -1180,6 +1193,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "tcp_no_ssthresh_metrics_save", + .data = &init_net.ipv4.sysctl_tcp_no_ssthresh_metrics_save, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { .procname = "tcp_moderate_rcvbuf", .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 77b485d60b9d..eb2d80519f8e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -271,6 +271,7 @@ #include <net/icmp.h> #include <net/inet_common.h> #include <net/tcp.h> +#include <net/mptcp.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/sock.h> @@ -326,7 +327,7 @@ void tcp_enter_memory_pressure(struct sock *sk) { unsigned long val; - if (tcp_memory_pressure) + if (READ_ONCE(tcp_memory_pressure)) return; val = jiffies; @@ -341,7 +342,7 @@ void tcp_leave_memory_pressure(struct sock *sk) { unsigned long val; - if (!tcp_memory_pressure) + if (!READ_ONCE(tcp_memory_pressure)) return; val = xchg(&tcp_memory_pressure, 0); if (val) @@ -443,15 +444,13 @@ void tcp_init_sock(struct sock *sk) tp->tsoffset = 0; tp->rack.reo_wnd_steps = 1; - sk->sk_state = TCP_CLOSE; - sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); icsk->icsk_sync_mss = tcp_sync_mss; - sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; - sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; + WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); + WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); sk_sockets_allocated_inc(sk); sk->sk_route_forced_caps = NETIF_F_GSO; @@ -477,7 +476,7 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, int target, struct sock *sk) { - return (tp->rcv_nxt - tp->copied_seq >= target) || + return (READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq) >= target) || (sk->sk_prot->stream_memory_read ? sk->sk_prot->stream_memory_read(sk) : false); } @@ -543,10 +542,10 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) /* Connected or passive Fast Open socket? */ if (state != TCP_SYN_SENT && - (state != TCP_SYN_RECV || tp->fastopen_rsk)) { + (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) { int target = sock_rcvlowat(sk, 0, INT_MAX); - if (tp->urg_seq == tp->copied_seq && + if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) && !sock_flag(sk, SOCK_URGINLINE) && tp->urg_data) target++; @@ -584,7 +583,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) } /* This barrier is coupled with smp_wmb() in tcp_reset() */ smp_rmb(); - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR; return mask; @@ -607,7 +606,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) unlock_sock_fast(sk, slow); break; case SIOCATMARK: - answ = tp->urg_data && tp->urg_seq == tp->copied_seq; + answ = tp->urg_data && + READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq); break; case SIOCOUTQ: if (sk->sk_state == TCP_LISTEN) @@ -616,7 +616,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else - answ = tp->write_seq - tp->snd_una; + answ = READ_ONCE(tp->write_seq) - tp->snd_una; break; case SIOCOUTQNSD: if (sk->sk_state == TCP_LISTEN) @@ -625,7 +625,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else - answ = tp->write_seq - tp->snd_nxt; + answ = READ_ONCE(tp->write_seq) - + READ_ONCE(tp->snd_nxt); break; default: return -ENOIOCTLCMD; @@ -657,7 +658,7 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb) tcb->sacked = 0; __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); - sk->sk_wmem_queued += skb->truesize; + sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; @@ -690,8 +691,8 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, refcount_read(&sk->sk_wmem_alloc) > skb->truesize; } -static void tcp_push(struct sock *sk, int flags, int mss_now, - int nonagle, int size_goal) +void tcp_push(struct sock *sk, int flags, int mss_now, + int nonagle, int size_goal) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -925,7 +926,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, return max(size_goal, mss_now); } -static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) +int tcp_send_mss(struct sock *sk, int *size_goal, int flags) { int mss_now; @@ -935,6 +936,22 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) return mss_now; } +/* In some cases, both sendpage() and sendmsg() could have added + * an skb to the write queue, but failed adding payload on it. + * We need to remove it to consume less memory, but more + * importantly be able to generate EPOLLOUT for Edge Trigger epoll() + * users. + */ +static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb) +{ + if (skb && !skb->len) { + tcp_unlink_write_queue(skb, sk); + if (tcp_write_queue_empty(sk)) + tcp_chrono_stop(sk, TCP_CHRONO_BUSY); + sk_wmem_free_skb(sk, skb); + } +} + ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int flags) { @@ -1016,10 +1033,10 @@ new_segment: skb->len += copy; skb->data_len += copy; skb->truesize += copy; - sk->sk_wmem_queued += copy; + sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); skb->ip_summed = CHECKSUM_PARTIAL; - tp->write_seq += copy; + WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); @@ -1064,12 +1081,12 @@ out: return copied; do_error: + tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk)); if (copied) goto out; out_err: /* make sure we wake any epoll edge trigger waiter */ - if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && - err == -EAGAIN)) { + if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } @@ -1165,7 +1182,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) struct sockcm_cookie sockc; int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; - bool process_backlog = false; + int process_backlog = 0; bool zc = false; long timeo; @@ -1257,9 +1274,10 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - if (process_backlog && sk_flush_backlog(sk)) { - process_backlog = false; - goto restart; + if (unlikely(process_backlog >= 16)) { + process_backlog = 0; + if (sk_flush_backlog(sk)) + goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, @@ -1267,7 +1285,7 @@ new_segment: if (!skb) goto wait_for_memory; - process_backlog = true; + process_backlog++; skb->ip_summed = CHECKSUM_PARTIAL; skb_entail(sk, skb); @@ -1344,7 +1362,7 @@ new_segment: if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; - tp->write_seq += copy; + WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); @@ -1388,26 +1406,18 @@ out_nopush: sock_zerocopy_put(uarg); return copied + copied_syn; +do_error: + skb = tcp_write_queue_tail(sk); do_fault: - if (!skb->len) { - tcp_unlink_write_queue(skb, sk); - /* It is the one place in all of TCP, except connection - * reset, where we can be unlinking the send_head. - */ - if (tcp_write_queue_empty(sk)) - tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - sk_wmem_free_skb(sk, skb); - } + tcp_remove_empty_skb(sk, skb); -do_error: if (copied + copied_syn) goto out; out_err: sock_zerocopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ - if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && - err == -EAGAIN)) { + if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } @@ -1657,9 +1667,9 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_eat_skb(sk, skb); if (!desc->count) break; - tp->copied_seq = seq; + WRITE_ONCE(tp->copied_seq, seq); } - tp->copied_seq = seq; + WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); @@ -1688,7 +1698,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) else cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1; val = min(val, cap); - sk->sk_rcvlowat = val ? : 1; + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); /* Check if we need to signal EPOLLIN right now */ tcp_data_ready(sk); @@ -1698,7 +1708,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) val <<= 1; if (val > sk->sk_rcvbuf) { - sk->sk_rcvbuf = val; + WRITE_ONCE(sk->sk_rcvbuf, val); tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); } return 0; @@ -1728,8 +1738,8 @@ static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc) { unsigned long address = (unsigned long)zc->address; + u32 length = 0, seq, offset, zap_len; const skb_frag_t *frags = NULL; - u32 length = 0, seq, offset; struct vm_area_struct *vma; struct sk_buff *skb = NULL; struct tcp_sock *tp; @@ -1756,17 +1766,19 @@ static int tcp_zerocopy_receive(struct sock *sk, seq = tp->copied_seq; inq = tcp_inq(sk); zc->length = min_t(u32, zc->length, inq); - zc->length &= ~(PAGE_SIZE - 1); - if (zc->length) { - zap_page_range(vma, address, zc->length); + zap_len = zc->length & ~(PAGE_SIZE - 1); + if (zap_len) { + zap_page_range(vma, address, zap_len); zc->recv_skip_hint = 0; } else { - zc->recv_skip_hint = inq; + zc->recv_skip_hint = zc->length; } ret = 0; while (length + PAGE_SIZE <= zc->length) { if (zc->recv_skip_hint < PAGE_SIZE) { if (skb) { + if (zc->recv_skip_hint > 0) + break; skb = skb->next; offset = seq - TCP_SKB_CB(skb)->seq; } else { @@ -1779,18 +1791,18 @@ static int tcp_zerocopy_receive(struct sock *sk, break; frags = skb_shinfo(skb)->frags; while (offset) { - if (frags->size > offset) + if (skb_frag_size(frags) > offset) goto out; - offset -= frags->size; + offset -= skb_frag_size(frags); frags++; } } - if (frags->size != PAGE_SIZE || frags->page_offset) { + if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) { int remaining = zc->recv_skip_hint; - while (remaining && (frags->size != PAGE_SIZE || - frags->page_offset)) { - remaining -= frags->size; + while (remaining && (skb_frag_size(frags) != PAGE_SIZE || + skb_frag_off(frags))) { + remaining -= skb_frag_size(frags); frags++; } zc->recv_skip_hint -= remaining; @@ -1808,7 +1820,7 @@ static int tcp_zerocopy_receive(struct sock *sk, out: up_read(¤t->mm->mmap_sem); if (length) { - tp->copied_seq = seq; + WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ @@ -1851,29 +1863,33 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, if (sock_flag(sk, SOCK_RCVTSTAMP)) { if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { if (new_tstamp) { - struct __kernel_timespec kts = {tss->ts[0].tv_sec, tss->ts[0].tv_nsec}; - + struct __kernel_timespec kts = { + .tv_sec = tss->ts[0].tv_sec, + .tv_nsec = tss->ts[0].tv_nsec, + }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, sizeof(kts), &kts); } else { - struct timespec ts_old = timespec64_to_timespec(tss->ts[0]); - + struct __kernel_old_timespec ts_old = { + .tv_sec = tss->ts[0].tv_sec, + .tv_nsec = tss->ts[0].tv_nsec, + }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, sizeof(ts_old), &ts_old); } } else { if (new_tstamp) { - struct __kernel_sock_timeval stv; - - stv.tv_sec = tss->ts[0].tv_sec; - stv.tv_usec = tss->ts[0].tv_nsec / 1000; + struct __kernel_sock_timeval stv = { + .tv_sec = tss->ts[0].tv_sec, + .tv_usec = tss->ts[0].tv_nsec / 1000, + }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, sizeof(stv), &stv); } else { - struct __kernel_old_timeval tv; - - tv.tv_sec = tss->ts[0].tv_sec; - tv.tv_usec = tss->ts[0].tv_nsec / 1000; + struct __kernel_old_timeval tv = { + .tv_sec = tss->ts[0].tv_sec, + .tv_usec = tss->ts[0].tv_nsec / 1000, + }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, sizeof(tv), &tv); } @@ -1945,13 +1961,12 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, struct sk_buff *skb, *last; u32 urg_hole = 0; struct scm_timestamping_internal tss; - bool has_tss = false; - bool has_cmsg; + int cmsg_flags; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); - if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && + if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && (sk->sk_state == TCP_ESTABLISHED)) sk_busy_loop(sk, nonblock); @@ -1961,7 +1976,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (sk->sk_state == TCP_LISTEN) goto out; - has_cmsg = tp->recvmsg_inq; + cmsg_flags = tp->recvmsg_inq ? 1 : 0; timeo = sock_rcvtimeo(sk, nonblock); /* Urgent data needs to be handled specially. */ @@ -2034,7 +2049,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, /* Well, if we have backlog, try to process it now yet. */ - if (copied >= target && !sk->sk_backlog.tail) + if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { @@ -2106,7 +2121,7 @@ found_ok_skb: if (urg_offset < used) { if (!urg_offset) { if (!sock_flag(sk, SOCK_URGINLINE)) { - ++*seq; + WRITE_ONCE(*seq, *seq + 1); urg_hole++; offset++; used--; @@ -2128,7 +2143,7 @@ found_ok_skb: } } - *seq += used; + WRITE_ONCE(*seq, *seq + used); copied += used; len -= used; @@ -2144,8 +2159,7 @@ skip_copy: if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, &tss); - has_tss = true; - has_cmsg = true; + cmsg_flags |= 2; } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; @@ -2155,7 +2169,7 @@ skip_copy: found_fin_ok: /* Process the FIN. */ - ++*seq; + WRITE_ONCE(*seq, *seq + 1); if (!(flags & MSG_PEEK)) sk_eat_skb(sk, skb); break; @@ -2170,10 +2184,10 @@ found_fin_ok: release_sock(sk); - if (has_cmsg) { - if (has_tss) + if (cmsg_flags) { + if (cmsg_flags & 2) tcp_recv_timestamp(msg, sk, &tss); - if (tp->recvmsg_inq) { + if (cmsg_flags & 1) { inq = tcp_inq_hint(sk); put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); } @@ -2476,7 +2490,10 @@ adjudge_to_death: } if (sk->sk_state == TCP_CLOSE) { - struct request_sock *req = tcp_sk(sk)->fastopen_rsk; + struct request_sock *req; + + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, + lockdep_sock_is_held(sk)); /* We could get here with a non-NULL req if the socket is * aborted (e.g., closed with unread data) before 3WHS * finishes. @@ -2508,6 +2525,7 @@ static void tcp_rtx_queue_purge(struct sock *sk) { struct rb_node *p = rb_first(&sk->tcp_rtx_queue); + tcp_sk(sk)->highest_sack = NULL; while (p) { struct sk_buff *skb = rb_to_skb(p); @@ -2548,6 +2566,7 @@ int tcp_disconnect(struct sock *sk, int flags) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state = sk->sk_state; + u32 seq; if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); @@ -2574,7 +2593,7 @@ int tcp_disconnect(struct sock *sk, int flags) __kfree_skb(sk->sk_rx_skb_cache); sk->sk_rx_skb_cache = NULL; } - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->urg_data = 0; tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); @@ -2590,21 +2609,25 @@ int tcp_disconnect(struct sock *sk, int flags) tp->srtt_us = 0; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); tp->rcv_rtt_last_tsecr = 0; - tp->write_seq += tp->max_window + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; + + seq = tp->write_seq + tp->max_window + 2; + if (!seq) + seq = 1; + WRITE_ONCE(tp->write_seq, seq); + icsk->icsk_backoff = 0; - tp->snd_cwnd = 2; icsk->icsk_probes_out = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd = TCP_INIT_CWND; tp->snd_cwnd_cnt = 0; tp->window_clamp = 0; + tp->delivered = 0; tp->delivered_ce = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); + tp->total_retrans = 0; inet_csk_delack_init(sk); /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 * issue in __tcp_select_window() @@ -2616,10 +2639,14 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_rx_dst = NULL; tcp_saved_syn_free(tp); tp->compressed_ack = 0; + tp->segs_in = 0; + tp->segs_out = 0; tp->bytes_sent = 0; tp->bytes_acked = 0; tp->bytes_received = 0; tp->bytes_retrans = 0; + tp->data_segs_in = 0; + tp->data_segs_out = 0; tp->duplicate_sack[0].start_seq = 0; tp->duplicate_sack[0].end_seq = 0; tp->dsack_dups = 0; @@ -2640,11 +2667,13 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rx_opt.saw_tstamp = 0; tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); inet->defer_connect = 0; + tp->fastopen_client_fail = 0; WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); @@ -2918,9 +2947,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (sk->sk_state != TCP_CLOSE) err = -EPERM; else if (tp->repair_queue == TCP_SEND_QUEUE) - tp->write_seq = val; + WRITE_ONCE(tp->write_seq, val); else if (tp->repair_queue == TCP_RECV_QUEUE) - tp->rcv_nxt = val; + WRITE_ONCE(tp->rcv_nxt, val); else err = -EINVAL; break; @@ -3203,8 +3232,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) * tcpi_unacked -> Number of children ready for accept() * tcpi_sacked -> max backlog */ - info->tcpi_unacked = sk->sk_ack_backlog; - info->tcpi_sacked = sk->sk_max_ack_backlog; + info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog); + info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog); return; } @@ -3282,6 +3311,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_bytes_retrans = tp->bytes_retrans; info->tcpi_dsack_dups = tp->dsack_dups; info->tcpi_reord_seen = tp->reord_seen; + info->tcpi_rcv_ooopack = tp->rcv_ooopack; + info->tcpi_snd_wnd = tp->snd_wnd; + info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3311,6 +3343,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ + nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ 0; } @@ -3365,6 +3398,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); + nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); return stats; } @@ -3784,8 +3818,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, return 1; for (i = 0; i < shi->nr_frags; ++i) { - const struct skb_frag_struct *f = &shi->frags[i]; - unsigned int offset = f->page_offset; + const skb_frag_t *f = &shi->frags[i]; + unsigned int offset = skb_frag_off(f); struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT); sg_set_page(&sg, page, skb_frag_size(f), @@ -3817,7 +3851,13 @@ EXPORT_SYMBOL(tcp_md5_hash_key); void tcp_done(struct sock *sk) { - struct request_sock *req = tcp_sk(sk)->fastopen_rsk; + struct request_sock *req; + + /* We might be called with a new socket, after + * inet_csk_prepare_forced_close() has been called + * so we can not use lockdep_sock_is_held(sk) + */ + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1); if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); @@ -3916,7 +3956,7 @@ void __init tcp_init(void) BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE); BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > - FIELD_SIZEOF(struct sk_buff, cb)); + sizeof_field(struct sk_buff, cb)); percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); @@ -3990,4 +4030,5 @@ void __init tcp_init(void) tcp_metrics_init(); BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tasklet_init(); + mptcp_init(); } diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 56be7d27f208..6c4d79baff26 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -306,7 +306,8 @@ static u32 bbr_tso_segs_goal(struct sock *sk) /* Sort of tcp_tso_autosize() but ignoring * driver provided sk_gso_max_size. */ - bytes = min_t(unsigned long, sk->sk_pacing_rate >> sk->sk_pacing_shift, + bytes = min_t(unsigned long, + sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); @@ -346,7 +347,7 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) /* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: * - * bdp = bw * min_rtt * gain + * bdp = ceil(bw * min_rtt * gain) * * The key factor, gain, controls the amount of queue. While a small gain * builds a smaller queue, it becomes more vulnerable to noise in RTT @@ -370,7 +371,9 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) w = (u64)bw * bbr->min_rtt_us; - /* Apply a gain to the given value, then remove the BW_SCALE shift. */ + /* Apply a gain to the given value, remove the BW_SCALE shift, and + * round the value up to avoid a negative feedback loop. + */ bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; return bdp; @@ -386,7 +389,7 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) * which allows 2 outstanding 2-packet sequences, to try to keep pipe * full even with ACK-every-other-packet delayed ACKs. */ -static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) +static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) { struct bbr *bbr = inet_csk_ca(sk); @@ -397,7 +400,7 @@ static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) cwnd = (cwnd + 1) & ~1U; /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ - if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT) + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) cwnd += 2; return cwnd; @@ -409,7 +412,7 @@ static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) u32 inflight; inflight = bbr_bdp(sk, bw, gain); - inflight = bbr_quantization_budget(sk, inflight, gain); + inflight = bbr_quantization_budget(sk, inflight); return inflight; } @@ -529,7 +532,7 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, * due to aggregation (of data and/or ACKs) visible in the ACK stream. */ target_cwnd += bbr_ack_aggregation_cwnd(sk); - target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); /* If we're below target cwnd, slow start cwnd toward target cwnd. */ if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ @@ -776,8 +779,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) * bandwidth sample. Delivered is in packets and interval_us in uS and * ratio will be <<1 for most connections. So delivered is first scaled. */ - bw = (u64)rs->delivered * BW_UNIT; - do_div(bw, rs->interval_us); + bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); /* If this sample is application-limited, it is likely to have a very * low delivered count that represents application behavior rather than diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 8a56e09cfb0e..8a01428f80c1 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -121,14 +121,14 @@ int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, struct sk_psock *psock; int copied, ret; - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - if (!skb_queue_empty(&sk->sk_receive_queue)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + if (!skb_queue_empty(&sk->sk_receive_queue) && + sk_psock_queue_empty(psock)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); lock_sock(sk); msg_bytes_ready: copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); @@ -139,7 +139,7 @@ msg_bytes_ready: timeo = sock_rcvtimeo(sk, nonblock); data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); if (data) { - if (skb_queue_empty(&sk->sk_receive_queue)) + if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; release_sock(sk); sk_psock_put(sk, psock); @@ -301,7 +301,7 @@ EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { - bool cork = false, enospc = msg->sg.start == msg->sg.end; + bool cork = false, enospc = sk_msg_full(msg); struct sock *sk_redir; u32 tosend, delta = 0; int ret; @@ -315,10 +315,7 @@ more_data: */ delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); - if (msg->sg.size < delta) - delta -= msg->sg.size; - else - delta = 0; + delta -= msg->sg.size; } if (msg->cork_bytes && diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index c445a81d144e..3172e31987be 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -21,7 +21,7 @@ static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); /* Simple linear search, don't expect many entries! */ -static struct tcp_congestion_ops *tcp_ca_find(const char *name) +struct tcp_congestion_ops *tcp_ca_find(const char *name) { struct tcp_congestion_ops *e; @@ -162,7 +162,7 @@ void tcp_assign_congestion_control(struct sock *sk) rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); - if (unlikely(!try_module_get(ca->owner))) + if (unlikely(!bpf_try_module_get(ca, ca->owner))) ca = &tcp_reno; icsk->icsk_ca_ops = ca; rcu_read_unlock(); @@ -208,7 +208,7 @@ void tcp_cleanup_congestion_control(struct sock *sk) if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); - module_put(icsk->icsk_ca_ops->owner); + bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); } /* Used by sysctl to change default congestion control */ @@ -222,12 +222,12 @@ int tcp_set_default_congestion_control(struct net *net, const char *name) ca = tcp_ca_find_autoload(net, name); if (!ca) { ret = -ENOENT; - } else if (!try_module_get(ca->owner)) { + } else if (!bpf_try_module_get(ca, ca->owner)) { ret = -EBUSY; } else { prev = xchg(&net->ipv4.tcp_congestion_control, ca); if (prev) - module_put(prev->owner); + bpf_module_put(prev, prev->owner); ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; @@ -256,6 +256,9 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; } rcu_read_unlock(); } @@ -285,6 +288,9 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; } rcu_read_unlock(); } @@ -360,19 +366,19 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, } else if (!load) { const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; - if (try_module_get(ca->owner)) { + if (bpf_try_module_get(ca, ca->owner)) { if (reinit) { tcp_reinit_congestion_control(sk, ca); } else { icsk->icsk_ca_ops = ca; - module_put(old_ca->owner); + bpf_module_put(old_ca, old_ca->owner); } } else { err = -EBUSY; } } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { err = -EPERM; - } else if (!try_module_get(ca->owner)) { + } else if (!bpf_try_module_get(ca, ca->owner)) { err = -EBUSY; } else { tcp_reinit_congestion_control(sk, ca); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 1b3d032a4df2..8f8eefd3a3ce 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -40,8 +40,8 @@ /* Number of delay samples for detecting the increase of delay */ #define HYSTART_MIN_SAMPLES 8 -#define HYSTART_DELAY_MIN (4U<<3) -#define HYSTART_DELAY_MAX (16U<<3) +#define HYSTART_DELAY_MIN (4000U) /* 4 ms */ +#define HYSTART_DELAY_MAX (16000U) /* 16 ms */ #define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) static int fast_convergence __read_mostly = 1; @@ -53,7 +53,7 @@ static int tcp_friendliness __read_mostly = 1; static int hystart __read_mostly = 1; static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; static int hystart_low_window __read_mostly = 16; -static int hystart_ack_delta __read_mostly = 2; +static int hystart_ack_delta_us __read_mostly = 2000; static u32 cube_rtt_scale __read_mostly; static u32 beta_scale __read_mostly; @@ -77,8 +77,8 @@ MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms" " 1: packet-train 2: delay 3: both packet-train and delay"); module_param(hystart_low_window, int, 0644); MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); -module_param(hystart_ack_delta, int, 0644); -MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)"); +module_param(hystart_ack_delta_us, int, 0644); +MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)"); /* BIC TCP Parameters */ struct bictcp { @@ -89,7 +89,7 @@ struct bictcp { u32 bic_origin_point;/* origin point of bic function */ u32 bic_K; /* time to origin point from the beginning of the current epoch */ - u32 delay_min; /* min delay (msec << 3) */ + u32 delay_min; /* min delay (usec) */ u32 epoch_start; /* beginning of an epoch */ u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ @@ -117,13 +117,9 @@ static inline void bictcp_reset(struct bictcp *ca) ca->found = 0; } -static inline u32 bictcp_clock(void) +static inline u32 bictcp_clock_us(const struct sock *sk) { -#if HZ < 1000 - return ktime_to_ms(ktime_get_real()); -#else - return jiffies_to_msecs(jiffies); -#endif + return tcp_sk(sk)->tcp_mstamp; } static inline void bictcp_hystart_reset(struct sock *sk) @@ -131,9 +127,9 @@ static inline void bictcp_hystart_reset(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - ca->round_start = ca->last_ack = bictcp_clock(); + ca->round_start = ca->last_ack = bictcp_clock_us(sk); ca->end_seq = tp->snd_nxt; - ca->curr_rtt = 0; + ca->curr_rtt = ~0U; ca->sample_cnt = 0; } @@ -276,7 +272,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) */ t = (s32)(tcp_jiffies32 - ca->epoch_start); - t += msecs_to_jiffies(ca->delay_min >> 3); + t += usecs_to_jiffies(ca->delay_min); /* change the unit from HZ to bictcp_HZ */ t <<= BICTCP_HZ; do_div(t, HZ); @@ -376,22 +372,54 @@ static void bictcp_state(struct sock *sk, u8 new_state) } } +/* Account for TSO/GRO delays. + * Otherwise short RTT flows could get too small ssthresh, since during + * slow start we begin with small TSO packets and ca->delay_min would + * not account for long aggregation delay when TSO packets get bigger. + * Ideally even with a very small RTT we would like to have at least one + * TSO packet being sent and received by GRO, and another one in qdisc layer. + * We apply another 100% factor because @rate is doubled at this point. + * We cap the cushion to 1ms. + */ +static u32 hystart_ack_delay(struct sock *sk) +{ + unsigned long rate; + + rate = READ_ONCE(sk->sk_pacing_rate); + if (!rate) + return 0; + return min_t(u64, USEC_PER_MSEC, + div64_ul((u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate)); +} + static void hystart_update(struct sock *sk, u32 delay) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - - if (ca->found & hystart_detect) - return; + u32 threshold; if (hystart_detect & HYSTART_ACK_TRAIN) { - u32 now = bictcp_clock(); + u32 now = bictcp_clock_us(sk); /* first detection parameter - ack-train detection */ - if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { + if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) { ca->last_ack = now; - if ((s32)(now - ca->round_start) > ca->delay_min >> 4) { - ca->found |= HYSTART_ACK_TRAIN; + + threshold = ca->delay_min + hystart_ack_delay(sk); + + /* Hystart ack train triggers if we get ack past + * ca->delay_min/2. + * Pacing might have delayed packets up to RTT/2 + * during slow start. + */ + if (sk->sk_pacing_status == SK_PACING_NONE) + threshold >>= 1; + + if ((s32)(now - ca->round_start) > threshold) { + ca->found = 1; + pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n", + now - ca->round_start, threshold, + ca->delay_min, hystart_ack_delay(sk), tp->snd_cwnd); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINDETECT); NET_ADD_STATS(sock_net(sk), @@ -405,14 +433,14 @@ static void hystart_update(struct sock *sk, u32 delay) if (hystart_detect & HYSTART_DELAY) { /* obtain the minimum delay of more than sampling packets */ if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { - if (ca->curr_rtt == 0 || ca->curr_rtt > delay) + if (ca->curr_rtt > delay) ca->curr_rtt = delay; ca->sample_cnt++; } else { if (ca->curr_rtt > ca->delay_min + HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { - ca->found |= HYSTART_DELAY; + ca->found = 1; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTDELAYDETECT); NET_ADD_STATS(sock_net(sk), @@ -424,9 +452,6 @@ static void hystart_update(struct sock *sk, u32 delay) } } -/* Track delayed acknowledgment ratio using sliding window - * ratio = (15*ratio + sample) / 16 - */ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); @@ -441,7 +466,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ) return; - delay = (sample->rtt_us << 3) / USEC_PER_MSEC; + delay = sample->rtt_us; if (delay == 0) delay = 1; @@ -450,7 +475,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) ca->delay_min = delay; /* hystart triggers when cwnd is larger than some threshold */ - if (hystart && tcp_in_slow_start(tp) && + if (!ca->found && tcp_in_slow_start(tp) && hystart && tp->snd_cwnd >= hystart_low_window) hystart_update(sk, delay); } diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index a3a386236d93..0d08f9e2d8d0 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -21,13 +21,14 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, struct tcp_info *info = _info; if (inet_sk_state_load(sk) == TCP_LISTEN) { - r->idiag_rqueue = sk->sk_ack_backlog; - r->idiag_wqueue = sk->sk_max_ack_backlog; + r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog); + r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog); } else if (sk->sk_type == SOCK_STREAM) { const struct tcp_sock *tp = tcp_sk(sk); - r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); - r->idiag_wqueue = tp->write_seq - tp->snd_una; + r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) - + READ_ONCE(tp->copied_seq), 0); + r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una; } if (info) tcp_get_info(sk, info); @@ -81,13 +82,42 @@ static int tcp_diag_put_md5sig(struct sk_buff *skb, } #endif +static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk, + const struct tcp_ulp_ops *ulp_ops) +{ + struct nlattr *nest; + int err; + + nest = nla_nest_start_noflag(skb, INET_DIAG_ULP_INFO); + if (!nest) + return -EMSGSIZE; + + err = nla_put_string(skb, INET_ULP_INFO_NAME, ulp_ops->name); + if (err) + goto nla_failure; + + if (ulp_ops->get_info) + err = ulp_ops->get_info(sk, skb); + if (err) + goto nla_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_failure: + nla_nest_cancel(skb, nest); + return err; +} + static int tcp_diag_get_aux(struct sock *sk, bool net_admin, struct sk_buff *skb) { + struct inet_connection_sock *icsk = inet_csk(sk); + int err = 0; + #ifdef CONFIG_TCP_MD5SIG if (net_admin) { struct tcp_md5sig_info *md5sig; - int err = 0; rcu_read_lock(); md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info); @@ -99,11 +129,21 @@ static int tcp_diag_get_aux(struct sock *sk, bool net_admin, } #endif + if (net_admin) { + const struct tcp_ulp_ops *ulp_ops; + + ulp_ops = icsk->icsk_ulp_ops; + if (ulp_ops) + err = tcp_diag_put_ulp(skb, sk, ulp_ops); + if (err) + return err; + } return 0; } static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) { + struct inet_connection_sock *icsk = inet_csk(sk); size_t size = 0; #ifdef CONFIG_TCP_MD5SIG @@ -124,6 +164,17 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) } #endif + if (net_admin && sk_fullsock(sk)) { + const struct tcp_ulp_ops *ulp_ops; + + ulp_ops = icsk->icsk_ulp_ops; + if (ulp_ops) { + size += nla_total_size(0) + + nla_total_size(TCP_ULP_NAME_MAX); + if (ulp_ops->get_info_size) + size += ulp_ops->get_info_size(sk); + } + } return size; } diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 3fd451271a70..19ad9586c720 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -253,7 +253,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, */ tp = tcp_sk(child); - tp->fastopen_rsk = req; + rcu_assign_pointer(tp->fastopen_rsk, req); tcp_rsk(req)->tfo_listener = true; /* RFC1323: The window in SYN & SYN/ACK segments is never @@ -422,7 +422,10 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, cookie->len = -1; return true; } - return cookie->len > 0; + if (cookie->len > 0) + return true; + tcp_sk(sk)->fastopen_client_fail = TFO_COOKIE_UNAVAILABLE; + return false; } /* This function checks if we want to defer sending SYN until the first diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c21e8a22fb3b..316ebdf8151d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,6 +79,7 @@ #include <trace/events/tcp.h> #include <linux/jump_label_ratelimit.h> #include <net/busy_poll.h> +#include <net/mptcp.h> int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -266,7 +267,7 @@ static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) { - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; } static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) @@ -359,7 +360,8 @@ static void tcp_sndbuf_expand(struct sock *sk) sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) - sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]); + WRITE_ONCE(sk->sk_sndbuf, + min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2])); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -483,8 +485,9 @@ static void tcp_clamp_window(struct sock *sk) !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { - sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - net->ipv4.sysctl_tcp_rmem[2]); + WRITE_ONCE(sk->sk_rcvbuf, + min(atomic_read(&sk->sk_rmem_alloc), + net->ipv4.sysctl_tcp_rmem[2])); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); @@ -648,7 +651,7 @@ void tcp_rcv_space_adjust(struct sock *sk) rcvbuf = min_t(u64, rcvwin * rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (rcvbuf > sk->sk_rcvbuf) { - sk->sk_rcvbuf = rcvbuf; + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make the window clamp follow along. */ tp->window_clamp = tcp_win_from_space(sk, rcvbuf); @@ -913,9 +916,10 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, /* This must be called before lost_out is incremented */ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) { - if (!tp->retransmit_skb_hint || - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) || + (tp->retransmit_skb_hint && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq))) tp->retransmit_skb_hint = skb; } @@ -1420,7 +1424,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) goto fallback; - if (!tcp_skb_can_collapse_to(prev)) + if (!tcp_skb_can_collapse(prev, skb)) goto fallback; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && @@ -1725,8 +1729,11 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } /* Ignore very old stuff early */ - if (!after(sp[used_sacks].end_seq, prior_snd_una)) + if (!after(sp[used_sacks].end_seq, prior_snd_una)) { + if (i == 0) + first_sack_index = -1; continue; + } used_sacks++; } @@ -2666,7 +2673,7 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, struct tcp_sock *tp = tcp_sk(sk); bool recovered = !before(tp->snd_una, tp->high_seq); - if ((flag & FLAG_SND_UNA_ADVANCED || tp->fastopen_rsk) && + if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) && tcp_try_undo_loss(sk, false)) return; @@ -2990,7 +2997,7 @@ void tcp_rearm_rto(struct sock *sk) /* If the retrans timer is currently being used by Fast Open * for SYN-ACK retrans purpose, stay put. */ - if (tp->fastopen_rsk) + if (rcu_access_pointer(tp->fastopen_rsk)) return; if (!tp->packets_out) { @@ -3158,6 +3165,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, tp->retransmit_skb_hint = NULL; if (unlikely(skb == tp->lost_skb_hint)) tp->lost_skb_hint = NULL; + tcp_highest_sack_replace(sk, skb, next); tcp_rtx_queue_unlink_and_free(skb, sk); } @@ -3362,7 +3370,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) sock_owned_by_me((struct sock *)tp); tp->bytes_received += delta; - tp->rcv_nxt = seq; + WRITE_ONCE(tp->rcv_nxt, seq); } /* Update our send window. @@ -3548,7 +3556,7 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT) return; - if (unlikely(rexmit == 2)) { + if (unlikely(rexmit == REXMIT_NEW)) { __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); if (after(tp->snd_nxt, tp->high_seq)) @@ -3782,6 +3790,49 @@ static void smc_parse_options(const struct tcphdr *th, #endif } +/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped + * value on success. + */ +static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) +{ + const unsigned char *ptr = (const unsigned char *)(th + 1); + int length = (th->doff * 4) - sizeof(struct tcphdr); + u16 mss = 0; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return mss; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + if (length < 2) + return mss; + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return mss; + if (opsize > length) + return mss; /* fail on partial options */ + if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) { + u16 in_mss = get_unaligned_be16(ptr); + + if (in_mss) { + if (user_mss && user_mss < in_mss) + in_mss = user_mss; + mss = in_mss; + } + } + ptr += opsize - 2; + length -= opsize; + } + } + return mss; +} + /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. @@ -3875,6 +3926,10 @@ void tcp_parse_options(const struct net *net, */ break; #endif + case TCPOPT_MPTCP: + mptcp_parse_option(skb, ptr, opsize, opt_rx); + break; + case TCPOPT_FASTOPEN: tcp_parse_fastopen_option( opsize - TCPOLEN_FASTOPEN_BASE, @@ -4216,8 +4271,10 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) * The receiver remembers and reflects via DSACKs. Leverage the * DSACK state and change the txhash to re-route speculatively. */ - if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) + if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) { sk_rethink_txhash(sk); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); + } } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) @@ -4375,6 +4432,9 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; + if (!mptcp_skb_can_collapse(to, from)) + return false; + #ifdef CONFIG_TLS_DEVICE if (from->decrypted != to->decrypted) return false; @@ -4512,6 +4572,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) tp->pred_flags = 0; inet_csk_schedule_ack(sk); + tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); seq = TCP_SKB_CB(skb)->seq; end_seq = TCP_SKB_CB(skb)->end_seq; @@ -4712,6 +4773,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) bool fragstolen; int eaten; + if (sk_is_mptcp(sk)) + mptcp_incoming_options(sk, skb, &tp->rx_opt); + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { __kfree_skb(skb); return; @@ -4883,7 +4947,7 @@ restart: /* The first skb to collapse is: * - not SYN/FIN and * - bloated or contains data before "start" or - * overlaps to the next one. + * overlaps to the next one and mptcp allow collapsing. */ if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && (tcp_win_from_space(sk, skb->truesize) > skb->len || @@ -4892,7 +4956,7 @@ restart: break; } - if (n && n != tail && + if (n && n != tail && mptcp_skb_can_collapse(skb, n) && TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { end_of_skbs = false; break; @@ -4925,6 +4989,7 @@ restart: else __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ skb_set_owner_r(nskb, sk); + mptcp_skb_ext_move(nskb, skb); /* Copy data, releasing collapsed skbs. */ while (copy > 0) { @@ -4944,6 +5009,7 @@ restart: skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || + !mptcp_skb_can_collapse(nskb, skb) || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) goto end; #ifdef CONFIG_TLS_DEVICE @@ -5312,7 +5378,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) } tp->urg_data = TCP_URG_NOTYET; - tp->urg_seq = ptr; + WRITE_ONCE(tp->urg_seq, ptr); /* Disable header prediction. */ tp->pred_flags = 0; @@ -5768,6 +5834,10 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); if (data) { /* Retransmit unacked data in SYN */ + if (tp->total_retrans) + tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED; + else + tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; skb_rbtree_walk_from(data) { if (__tcp_retransmit_skb(sk, data, 1)) break; @@ -5838,8 +5908,14 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * the segment and return)" */ if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || - after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) + after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { + /* Previous FIN/ACK or RST/ACK might be ignored. */ + if (icsk->icsk_retransmits == 0) + inet_csk_reset_xmit_timer(sk, + ICSK_TIME_RETRANS, + TCP_TIMEOUT_MIN, TCP_RTO_MAX); goto reset_and_undo; + } if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, @@ -5888,7 +5964,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* Ok.. it's good. Set up sequence numbers and * move to established. */ - tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is @@ -5914,10 +5990,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); + if (sk_is_mptcp(sk)) + mptcp_rcv_synsent(sk); + /* Remember, tcp_poll() does not lock socket! * Change state from SYN-SENT only after copied_seq * is initialized. */ - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); smc_check_reset_syn(tp); @@ -5991,8 +6070,8 @@ discard: tp->tcp_header_len = sizeof(struct tcphdr); } - tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is @@ -6043,6 +6122,8 @@ reset_and_undo: static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) { + struct request_sock *req; + tcp_try_undo_loss(sk, false); /* Reset rtx states to prevent spurious retransmits_timed_out() */ @@ -6052,7 +6133,9 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, * we no longer need req so release it. */ - reqsk_fastopen_remove(sk, tcp_sk(sk)->fastopen_rsk, false); + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, + lockdep_sock_is_held(sk)); + reqsk_fastopen_remove(sk, req, false); /* Re-arm the timer because data may have been sent out. * This is similar to the regular data transmission case @@ -6127,7 +6210,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_mstamp_refresh(tp); tp->rx_opt.saw_tstamp = 0; - req = tp->fastopen_rsk; + req = rcu_dereference_protected(tp->fastopen_rsk, + lockdep_sock_is_held(sk)); if (req) { bool req_stolen; @@ -6167,7 +6251,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_try_undo_spurious_syn(sk); tp->retrans_stamp = 0; tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); } smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); @@ -6274,8 +6358,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_CLOSE_WAIT: case TCP_CLOSING: case TCP_LAST_ACK: - if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + if (sk_is_mptcp(sk)) + mptcp_incoming_options(sk, skb, &tp->rx_opt); break; + } /* fall through */ case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: @@ -6422,9 +6509,7 @@ EXPORT_SYMBOL(inet_reqsk_alloc); /* * Return true if a syncookie should be sent */ -static bool tcp_syn_flood_action(const struct sock *sk, - const struct sk_buff *skb, - const char *proto) +static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; @@ -6444,7 +6529,7 @@ static bool tcp_syn_flood_action(const struct sock *sk, net->ipv4.sysctl_tcp_syncookies != 2 && xchg(&queue->synflood_warned, 1) == 0) net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", - proto, ntohs(tcp_hdr(skb)->dest), msg); + proto, sk->sk_num, msg); return want_cookie; } @@ -6466,6 +6551,36 @@ static void tcp_reqsk_record_syn(const struct sock *sk, } } +/* If a SYN cookie is required and supported, returns a clamped MSS value to be + * used for SYN cookie generation. + */ +u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct tcphdr *th) +{ + struct tcp_sock *tp = tcp_sk(sk); + u16 mss; + + if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 && + !inet_csk_reqsk_queue_is_full(sk)) + return 0; + + if (!tcp_syn_flood_action(sk, rsk_ops->slab_name)) + return 0; + + if (sk_acceptq_is_full(sk)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + return 0; + } + + mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss); + if (!mss) + mss = af_ops->mss_clamp; + + return mss; +} +EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss); + int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) @@ -6487,7 +6602,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, */ if ((net->ipv4.sysctl_tcp_syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { - want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); + want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); if (!want_cookie) goto drop; } @@ -6503,6 +6618,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->af_specific = af_ops; tcp_rsk(req)->ts_off = 0; +#if IS_ENABLED(CONFIG_MPTCP) + tcp_rsk(req)->is_mptcp = 0; +#endif tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = af_ops->mss_clamp; @@ -6525,6 +6643,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, af_ops->init_req(req, sk, skb); + if (IS_ENABLED(CONFIG_MPTCP) && want_cookie) + tcp_rsk(req)->is_mptcp = 0; + if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d57641cb3477..df1166b76126 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -121,11 +121,9 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == AF_INET6) { if (ipv6_addr_loopback(&tw->tw_v6_daddr) || - (ipv6_addr_v4mapped(&tw->tw_v6_daddr) && - (tw->tw_v6_daddr.s6_addr[12] == 127)) || + ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || - (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) && - (tw->tw_v6_rcv_saddr.s6_addr[12] == 127))) + ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) loopback = true; } else #endif @@ -164,9 +162,11 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) * without appearing to create any others. */ if (likely(!tp->repair)) { - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; + u32 seq = tcptw->tw_snd_nxt + 65535 + 2; + + if (!seq) + seq = 1; + WRITE_ONCE(tp->write_seq, seq); tp->rx_opt.ts_recent = tcptw->tw_ts_recent; tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; } @@ -253,7 +253,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; if (likely(!tp->repair)) - tp->write_seq = 0; + WRITE_ONCE(tp->write_seq, 0); } inet->inet_dport = usin->sin_port; @@ -291,16 +291,17 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (likely(!tp->repair)) { if (!tp->write_seq) - tp->write_seq = secure_tcp_seq(inet->inet_saddr, - inet->inet_daddr, - inet->inet_sport, - usin->sin_port); + WRITE_ONCE(tp->write_seq, + secure_tcp_seq(inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, + usin->sin_port)); tp->tsoffset = secure_tcp_ts_off(sock_net(sk), inet->inet_saddr, inet->inet_daddr); } - inet->inet_id = tp->write_seq ^ jiffies; + inet->inet_id = prandom_u32(); if (tcp_fastopen_defer_connect(sk, &err)) return err; @@ -478,7 +479,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) icsk = inet_csk(sk); tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ - fastopen = tp->fastopen_rsk; + fastopen = rcu_dereference(tp->fastopen_rsk); snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { @@ -700,9 +701,21 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); if (sk && sk_fullsock(sk)) { - key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *) - &ip_hdr(skb)->saddr, AF_INET); + const union tcp_md5_addr *addr; + int l3index; + + /* sdif set, means packet ingressed via a device + * in an L3 domain and inet_iif is set to it. + */ + l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; + addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; + key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); } else if (hash_location) { + const union tcp_md5_addr *addr; + int sdif = tcp_v4_sdif(skb); + int dif = inet_iif(skb); + int l3index; + /* * active side is lost. Try to find listening socket through * source port, and then find md5 key through listening socket. @@ -713,14 +726,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, - ntohs(th->source), inet_iif(skb), - tcp_v4_sdif(skb)); + ntohs(th->source), dif, sdif); /* don't send rst if it can't find key */ if (!sk1) goto out; - key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *) - &ip_hdr(skb)->saddr, AF_INET); + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to it. + */ + l3index = sdif ? dif : 0; + addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; + key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); if (!key) goto out; @@ -771,6 +787,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : sk->sk_mark; + ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_priority : sk->sk_priority; transmit_time = tcp_transmit_time(sk); } ip_send_unicast_reply(ctl_sk, @@ -866,6 +884,8 @@ static void tcp_v4_send_ack(const struct sock *sk, ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : sk->sk_mark; + ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_priority : sk->sk_priority; transmit_time = tcp_transmit_time(sk); ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, @@ -900,6 +920,9 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { + const union tcp_md5_addr *addr; + int l3index; + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ @@ -911,14 +934,15 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, * exception of <SYN> segments, MUST be right-shifted by * Rcv.Wind.Shift bits: */ + addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; + l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, 0, - tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr, - AF_INET), + tcp_md5_do_lookup(sk, l3index, addr, AF_INET), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, ip_hdr(skb)->tos); } @@ -978,7 +1002,7 @@ DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); EXPORT_SYMBOL(tcp_md5_needed); /* Find the Key structure for an address. */ -struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, +struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { @@ -998,7 +1022,8 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, hlist_for_each_entry_rcu(key, &md5sig->head, node) { if (key->family != family) continue; - + if (key->l3index && key->l3index != l3index) + continue; if (family == AF_INET) { mask = inet_make_mask(key->prefixlen); match = (key->addr.a4.s_addr & mask) == @@ -1022,7 +1047,8 @@ EXPORT_SYMBOL(__tcp_md5_do_lookup); static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen) + int family, u8 prefixlen, + int l3index) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; @@ -1041,6 +1067,8 @@ static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, hlist_for_each_entry_rcu(key, &md5sig->head, node) { if (key->family != family) continue; + if (key->l3index && key->l3index != l3index) + continue; if (!memcmp(&key->addr, addr, size) && key->prefixlen == prefixlen) return key; @@ -1052,23 +1080,26 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { const union tcp_md5_addr *addr; + int l3index; + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), + addr_sk->sk_bound_dev_if); addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; - return tcp_md5_do_lookup(sk, addr, AF_INET); + return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); } EXPORT_SYMBOL(tcp_v4_md5_lookup); /* This can be called on a newly created socket, from other files */ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen, const u8 *newkey, u8 newkeylen, - gfp_t gfp) + int family, u8 prefixlen, int l3index, + const u8 *newkey, u8 newkeylen, gfp_t gfp) { /* Add Key to the list */ struct tcp_md5sig_key *key; struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *md5sig; - key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); if (key) { /* Pre-existing entry - just update that one. */ memcpy(key->key, newkey, newkeylen); @@ -1100,6 +1131,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, key->keylen = newkeylen; key->family = family; key->prefixlen = prefixlen; + key->l3index = l3index; memcpy(&key->addr, addr, (family == AF_INET6) ? sizeof(struct in6_addr) : sizeof(struct in_addr)); @@ -1109,11 +1141,11 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, EXPORT_SYMBOL(tcp_md5_do_add); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, - u8 prefixlen) + u8 prefixlen, int l3index) { struct tcp_md5sig_key *key; - key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); if (!key) return -ENOENT; hlist_del_rcu(&key->node); @@ -1144,7 +1176,9 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, { struct tcp_md5sig cmd; struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; + const union tcp_md5_addr *addr; u8 prefixlen = 32; + int l3index = 0; if (optlen < sizeof(cmd)) return -EINVAL; @@ -1162,16 +1196,34 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, return -EINVAL; } + if (optname == TCP_MD5SIG_EXT && + cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); + if (dev && netif_is_l3_master(dev)) + l3index = dev->ifindex; + + rcu_read_unlock(); + + /* ok to reference set/not set outside of rcu; + * right now device MUST be an L3 master + */ + if (!dev || !l3index) + return -EINVAL; + } + + addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; + if (!cmd.tcpm_keylen) - return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, - AF_INET, prefixlen); + return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index); if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; - return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, - AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen, - GFP_KERNEL); + return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, + cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, @@ -1281,7 +1333,8 @@ EXPORT_SYMBOL(tcp_v4_md5_hash_skb); /* Called with rcu_read_lock() */ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, - const struct sk_buff *skb) + const struct sk_buff *skb, + int dif, int sdif) { #ifdef CONFIG_TCP_MD5SIG /* @@ -1296,11 +1349,17 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, struct tcp_md5sig_key *hash_expected; const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); - int genhash; + const union tcp_md5_addr *addr; unsigned char newhash[16]; + int genhash, l3index; + + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to the l3mdev + */ + l3index = sdif ? dif : 0; - hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr, - AF_INET); + addr = (union tcp_md5_addr *)&iph->saddr; + hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); hash_location = tcp_parse_md5sig_option(th); /* We've parsed the options - do we have a hash? */ @@ -1326,11 +1385,11 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, if (genhash || memcmp(hash_location, newhash, 16) != 0) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); - net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", + net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", &iph->saddr, ntohs(th->source), &iph->daddr, ntohs(th->dest), genhash ? " tcp_v4_calc_md5_hash failed" - : ""); + : "", l3index); return true; } return false; @@ -1367,7 +1426,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { .syn_ack_timeout = tcp_syn_ack_timeout, }; -static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { +const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .mss_clamp = TCP_MSS_DEFAULT, #ifdef CONFIG_TCP_MD5SIG .req_md5_lookup = tcp_v4_md5_lookup, @@ -1414,7 +1473,9 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG + const union tcp_md5_addr *addr; struct tcp_md5sig_key *key; + int l3index; #endif struct ip_options_rcu *inet_opt; @@ -1443,7 +1504,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; - newinet->inet_id = newtp->write_seq ^ jiffies; + newinet->inet_id = prandom_u32(); if (!dst) { dst = inet_csk_route_child_sock(sk, newsk, req); @@ -1462,9 +1523,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, tcp_initialize_rcv_mss(newsk); #ifdef CONFIG_TCP_MD5SIG + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); /* Copy over the MD5 key from the original socket */ - key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr, - AF_INET); + addr = (union tcp_md5_addr *)&newinet->inet_daddr; + key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); if (key) { /* * We're using one, so create a matching key @@ -1472,8 +1534,8 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, * memory, then we end up not copying the key * across. Shucks. */ - tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr, - AF_INET, 32, key->key, key->keylen, GFP_ATOMIC); + tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, + key->key, key->keylen, GFP_ATOMIC); sk_nocaps_add(newsk, NETIF_F_GSO_MASK); } #endif @@ -1515,6 +1577,21 @@ static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) return sk; } +u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, + struct tcphdr *th, u32 *cookie) +{ + u16 mss = 0; +#ifdef CONFIG_SYN_COOKIES + mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, + &tcp_request_sock_ipv4_ops, sk, th); + if (mss) { + *cookie = __cookie_v4_init_sequence(iph, th, &mss); + tcp_synq_overflow(sk); + } +#endif + return mss; +} + /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. * @@ -1625,7 +1702,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { - u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; + u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); struct skb_shared_info *shinfo; const struct tcphdr *th; struct tcphdr *thtail; @@ -1788,6 +1865,7 @@ int tcp_v4_rcv(struct sk_buff *skb) struct net *net = dev_net(skb->dev); struct sk_buff *skb_to_free; int sdif = inet_sdif(skb); + int dif = inet_iif(skb); const struct iphdr *iph; const struct tcphdr *th; bool refcounted; @@ -1836,7 +1914,7 @@ process: struct sock *nsk; sk = req->rsk_listener; - if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { + if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; @@ -1894,10 +1972,10 @@ process: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - if (tcp_v4_inbound_md5_hash(sk, skb)) + if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) goto discard_and_relse; - nf_reset(skb); + nf_reset_ct(skb); if (tcp_filter(sk, skb)) goto discard_and_relse; @@ -2102,7 +2180,7 @@ void tcp_v4_destroy_sock(struct sock *sk) if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); - BUG_ON(tp->fastopen_rsk); + BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); @@ -2127,13 +2205,14 @@ static void *listening_get_next(struct seq_file *seq, void *cur) struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); struct inet_listen_hashbucket *ilb; + struct hlist_nulls_node *node; struct sock *sk = cur; if (!sk) { get_head: ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_lock(&ilb->lock); - sk = sk_head(&ilb->head); + sk = sk_nulls_head(&ilb->nulls_head); st->offset = 0; goto get_sk; } @@ -2141,9 +2220,9 @@ get_head: ++st->num; ++st->offset; - sk = sk_next(sk); + sk = sk_nulls_next(sk); get_sk: - sk_for_each_from(sk) { + sk_nulls_for_each_from(sk, node) { if (!net_eq(sock_net(sk), net)) continue; if (sk->sk_family == afinfo->family) @@ -2431,17 +2510,18 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) state = inet_sk_state_load(sk); if (state == TCP_LISTEN) - rx_queue = sk->sk_ack_backlog; + rx_queue = READ_ONCE(sk->sk_ack_backlog); else /* Because we don't lock the socket, * we might find a transient negative value. */ - rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); + rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - + READ_ONCE(tp->copied_seq), 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", i, src, srcp, dest, destp, state, - tp->write_seq - tp->snd_una, + READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), @@ -2598,7 +2678,8 @@ static void __net_exit tcp_sk_exit(struct net *net) int cpu; if (net->ipv4.tcp_congestion_control) - module_put(net->ipv4.tcp_congestion_control->owner); + bpf_module_put(net->ipv4.tcp_congestion_control, + net->ipv4.tcp_congestion_control->owner); for_each_possible_cpu(cpu) inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); @@ -2637,6 +2718,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; + net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; @@ -2652,12 +2734,13 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; net->ipv4.sysctl_tcp_tw_reuse = 2; + net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; cnt = tcp_hashinfo.ehash_mask + 1; net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; - net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256); + net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); net->ipv4.sysctl_tcp_sack = 1; net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; @@ -2703,7 +2786,8 @@ static int __net_init tcp_sk_init(struct net *net) /* Reno is always built in */ if (!net_eq(net, &init_net) && - try_module_get(init_net.ipv4.tcp_congestion_control->owner)) + bpf_try_module_get(init_net.ipv4.tcp_congestion_control, + init_net.ipv4.tcp_congestion_control->owner)) net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; else net->ipv4.tcp_congestion_control = &tcp_reno; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index c4848e7a0aad..279db8822439 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -385,7 +385,8 @@ void tcp_update_metrics(struct sock *sk) if (tcp_in_initial_slowstart(tp)) { /* Slow start still did not finish. */ - if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { + if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save && + !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val && (tp->snd_cwnd >> 1) > val) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, @@ -400,7 +401,8 @@ void tcp_update_metrics(struct sock *sk) } else if (!tcp_in_slow_start(tp) && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ - if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) + if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save && + !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { @@ -416,7 +418,8 @@ void tcp_update_metrics(struct sock *sk) tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_ssthresh) >> 1); } - if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { + if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save && + !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val && tp->snd_ssthresh > val) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, @@ -441,6 +444,7 @@ void tcp_init_metrics(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct tcp_metrics_block *tm; u32 val, crtt = 0; /* cached RTT scaled by 8 */ @@ -458,7 +462,8 @@ void tcp_init_metrics(struct sock *sk) if (tcp_metric_locked(tm, TCP_METRIC_CWND)) tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND); - val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); + val = net->ipv4.sysctl_tcp_no_ssthresh_metrics_save ? + 0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val) { tp->snd_ssthresh = val; if (tp->snd_ssthresh > tp->snd_cwnd_clamp) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8bcaf2586b68..ad3b56d9fa71 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -266,6 +266,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_transparent = inet->transparent; tw->tw_mark = sk->sk_mark; + tw->tw_priority = sk->sk_priority; tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; @@ -413,7 +414,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) rcu_read_lock(); ca = tcp_ca_find_key(ca_key); - if (likely(ca && try_module_get(ca->owner))) { + if (likely(ca && bpf_try_module_get(ca, ca->owner))) { icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; ca_got_dst = true; @@ -424,7 +425,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) /* If no valid choice made yet, assign current system default ca. */ if (!ca_got_dst && (!icsk->icsk_ca_setsockopt || - !try_module_get(icsk->icsk_ca_ops->owner))) + !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner))) tcp_assign_congestion_control(sk); tcp_set_ca_state(sk, TCP_CA_Open); @@ -461,6 +462,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk; struct tcp_sock *oldtp, *newtp; + u32 seq; if (!newsk) return NULL; @@ -474,12 +476,16 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, /* Now setup tcp_sock */ newtp->pred_flags = 0; - newtp->rcv_wup = newtp->copied_seq = - newtp->rcv_nxt = treq->rcv_isn + 1; + seq = treq->rcv_isn + 1; + newtp->rcv_wup = seq; + WRITE_ONCE(newtp->copied_seq, seq); + WRITE_ONCE(newtp->rcv_nxt, seq); newtp->segs_in = 1; - newtp->snd_sml = newtp->snd_una = - newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; + seq = treq->snt_isn + 1; + newtp->snd_sml = newtp->snd_una = seq; + WRITE_ONCE(newtp->snd_nxt, seq); + newtp->snd_up = seq; INIT_LIST_HEAD(&newtp->tsq_node); INIT_LIST_HEAD(&newtp->tsorted_sent_queue); @@ -494,7 +500,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->total_retrans = req->num_retrans; tcp_init_xmit_timers(newsk); - newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; + WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); if (sock_flag(newsk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(newsk, @@ -540,7 +546,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rx_opt.mss_clamp = req->mss; tcp_ecn_openreq_child(newtp, req); newtp->fastopen_req = NULL; - newtp->fastopen_rsk = NULL; + RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 979520e46e33..306e25d743e8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -38,6 +38,7 @@ #define pr_fmt(fmt) "TCP: " fmt #include <net/tcp.h> +#include <net/mptcp.h> #include <linux/compiler.h> #include <linux/gfp.h> @@ -67,11 +68,14 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq); __skb_unlink(skb, &sk->sk_write_queue); tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); + if (tp->highest_sack == NULL) + tp->highest_sack = skb; + tp->packets_out += tcp_skb_pcount(skb); if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); @@ -411,6 +415,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_WSCALE (1 << 3) #define OPTION_FAST_OPEN_COOKIE (1 << 8) #define OPTION_SMC (1 << 9) +#define OPTION_MPTCP (1 << 10) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -436,8 +441,17 @@ struct tcp_out_options { __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ + struct mptcp_out_options mptcp; }; +static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) +{ +#if IS_ENABLED(CONFIG_MPTCP) + if (unlikely(OPTION_MPTCP & opts->options)) + mptcp_write_options(ptr, &opts->mptcp); +#endif +} + /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -546,6 +560,8 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, } smc_options_write(ptr, &options); + + mptcp_options_write(ptr, opts); } static void smc_set_option(const struct tcp_sock *tp, @@ -581,6 +597,22 @@ static void smc_set_option_cond(const struct tcp_sock *tp, #endif } +static void mptcp_set_option_cond(const struct request_sock *req, + struct tcp_out_options *opts, + unsigned int *remaining) +{ + if (rsk_is_mptcp(req)) { + unsigned int size; + + if (mptcp_synack_options(req, &size, &opts->mptcp)) { + if (*remaining >= size) { + opts->options |= OPTION_MPTCP; + *remaining -= size; + } + } + } +} + /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ @@ -650,6 +682,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, smc_set_option(tp, opts, &remaining); + if (sk_is_mptcp(sk)) { + unsigned int size; + + if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) { + opts->options |= OPTION_MPTCP; + remaining -= size; + } + } + return MAX_TCP_OPTION_SPACE - remaining; } @@ -711,6 +752,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, } } + mptcp_set_option_cond(req, opts, &remaining); + smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; @@ -748,13 +791,35 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb size += TCPOLEN_TSTAMP_ALIGNED; } + /* MPTCP options have precedence over SACK for the limited TCP + * option space because a MPTCP connection would be forced to + * fall back to regular TCP if a required multipath option is + * missing. SACK still gets a chance to use whatever space is + * left. + */ + if (sk_is_mptcp(sk)) { + unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + unsigned int opt_size = 0; + + if (mptcp_established_options(sk, skb, &opt_size, remaining, + &opts->mptcp)) { + opts->options |= OPTION_MPTCP; + size += opt_size; + } + } + eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED + + TCPOLEN_SACK_PERBLOCK)) + return size; + opts->num_sack_blocks = min_t(unsigned int, eff_sacks, (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK); + size += TCPOLEN_SACK_BASE_ALIGNED + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } @@ -1050,11 +1115,22 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); - if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) + if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); - else + } else { tcp_options_size = tcp_established_options(sk, skb, &opts, &md5); + /* Force a PSH flag on all (GSO) packets to expedite GRO flush + * at receiver : This slightly improve GRO performance. + * Note that we do not force the PSH flag for non GSO packets, + * because they might be sent under high congestion events, + * and in this case it is better to delay the delivery of 1-MSS + * packets and thus the corresponding ACK packet that would + * release the following packet. + */ + if (tcp_skb_pcount(skb) > 1) + tcb->tcp_flags |= TCPHDR_PSH; + } tcp_header_size = tcp_options_size + sizeof(struct tcphdr); /* if no packet is in qdisc/device queue, then allow XPS to select @@ -1185,10 +1261,10 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); /* Advance write_seq and place onto the write_queue. */ - tp->write_seq = TCP_SKB_CB(skb)->end_seq; + WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); - sk->sk_wmem_queued += skb->truesize; + sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); } @@ -1322,7 +1398,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, return -ENOMEM; /* We'll just try again later. */ skb_copy_decrypted(buff, skb); - sk->sk_wmem_queued += buff->truesize; + sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); nlen = skb->len - len - nsize; buff->truesize += nlen; @@ -1403,7 +1479,7 @@ static int __pskb_trim_head(struct sk_buff *skb, int len) } else { shinfo->frags[k] = shinfo->frags[i]; if (eat) { - shinfo->frags[k].page_offset += eat; + skb_frag_off_add(&shinfo->frags[k], eat); skb_frag_size_sub(&shinfo->frags[k], eat); eat = 0; } @@ -1432,7 +1508,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) if (delta_truesize) { skb->truesize -= delta_truesize; - sk->sk_wmem_queued -= delta_truesize; + sk_wmem_queued_add(sk, -delta_truesize); sk_mem_uncharge(sk, delta_truesize); sock_set_flag(sk, SOCK_QUEUE_SHRUNK); } @@ -1713,7 +1789,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, u32 bytes, segs; bytes = min_t(unsigned long, - sk->sk_pacing_rate >> sk->sk_pacing_shift, + sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); /* Goal is to send at least one packet per ms, @@ -1877,7 +1953,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, return -ENOMEM; skb_copy_decrypted(buff, skb); - sk->sk_wmem_queued += buff->truesize; + sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; skb->truesize -= nlen; @@ -2053,7 +2129,7 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) if (len <= skb->len) break; - if (unlikely(TCP_SKB_CB(skb)->eor)) + if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb)) return false; len -= skb->len; @@ -2141,7 +2217,7 @@ static int tcp_mtu_probe(struct sock *sk) nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); if (!nskb) return -1; - sk->sk_wmem_queued += nskb->truesize; + sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = tcp_send_head(sk); @@ -2170,6 +2246,7 @@ static int tcp_mtu_probe(struct sock *sk) * we need to propagate it to the new skb. */ TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor; + tcp_skb_collapse_tstamp(nskb, skb); tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); } else { @@ -2247,7 +2324,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, limit = max_t(unsigned long, 2 * skb->truesize, - sk->sk_pacing_rate >> sk->sk_pacing_shift); + sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift)); if (sk->sk_pacing_status == SK_PACING_NONE) limit = min_t(unsigned long, limit, sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); @@ -2425,6 +2502,14 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (tcp_small_queue_check(sk, skb, 0)) break; + /* Argh, we hit an empty skb(), presumably a thread + * is sleeping in sendmsg()/sk_stream_wait_memory(). + * We do not want to send a pure-ack packet and have + * a strange looking rtx queue with empty packet(s). + */ + if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) + break; + if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; @@ -2470,7 +2555,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ - if (tp->fastopen_rsk) + if (rcu_access_pointer(tp->fastopen_rsk)) return false; early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; @@ -2841,7 +2926,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (!tcp_can_collapse(sk, skb)) break; - if (!tcp_skb_can_collapse_to(to)) + if (!tcp_skb_can_collapse(to, skb)) break; space -= skb->len; @@ -3108,7 +3193,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size) */ void tcp_send_fin(struct sock *sk) { - struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk); + struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk); struct tcp_sock *tp = tcp_sk(sk); /* Optimization, tack on the FIN if we have one skb in write queue and @@ -3116,6 +3201,7 @@ void tcp_send_fin(struct sock *sk) * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ + tskb = tail; if (!tskb && tcp_under_memory_pressure(sk)) tskb = skb_rb_last(&sk->tcp_rtx_queue); @@ -3123,14 +3209,14 @@ void tcp_send_fin(struct sock *sk) TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; - if (tcp_write_queue_empty(sk)) { + if (!tail) { /* This means tskb was already sent. * Pretend we included the FIN on previous transmit. * We need to set tp->snd_nxt to the value it would have * if FIN had been sent. This is because retransmit path * does not change tp->snd_nxt. */ - tp->snd_nxt++; + WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1); return; } } else { @@ -3207,10 +3293,11 @@ int tcp_send_synack(struct sock *sk) if (!nskb) return -ENOMEM; INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); + tcp_highest_sack_replace(sk, skb, nskb); tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); - sk->sk_wmem_queued += nskb->truesize; + sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = nskb; } @@ -3278,7 +3365,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) - skb->skb_mstamp_ns = cookie_init_timestamp(req); + skb->skb_mstamp_ns = cookie_init_timestamp(req, now); else #endif { @@ -3343,8 +3430,8 @@ static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst) rcu_read_lock(); ca = tcp_ca_find_key(ca_key); - if (likely(ca && try_module_get(ca->owner))) { - module_put(icsk->icsk_ca_ops->owner); + if (likely(ca && bpf_try_module_get(ca, ca->owner))) { + bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; } @@ -3414,14 +3501,14 @@ static void tcp_connect_init(struct sock *sk) tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; - tp->snd_nxt = tp->write_seq; + WRITE_ONCE(tp->snd_nxt, tp->write_seq); if (likely(!tp->repair)) tp->rcv_nxt = 0; else tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); inet_csk(sk)->icsk_retransmits = 0; @@ -3435,9 +3522,9 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) tcb->end_seq += skb->len; __skb_header_release(skb); - sk->sk_wmem_queued += skb->truesize; + sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); - tp->write_seq = tcb->end_seq; + WRITE_ONCE(tp->write_seq, tcb->end_seq); tp->packets_out += tcp_skb_pcount(skb); } @@ -3574,11 +3661,11 @@ int tcp_connect(struct sock *sk) /* We change tp->snd_nxt after the tcp_transmit_skb() call * in order to make this packet get counted in tcpOutSegs. */ - tp->snd_nxt = tp->write_seq; + WRITE_ONCE(tp->snd_nxt, tp->write_seq); tp->pushed_seq = tp->write_seq; buff = tcp_send_head(sk); if (unlikely(buff)) { - tp->snd_nxt = TCP_SKB_CB(buff)->seq; + WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq); tp->pushed_seq = TCP_SKB_CB(buff)->seq; } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c801cd37cc2a..c3f26dcd6704 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -154,7 +154,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) } else { mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; mss = min(net->ipv4.sysctl_tcp_base_mss, mss); - mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len); + mss = max(mss, net->ipv4.sysctl_tcp_mtu_probe_floor); mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss); icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); } @@ -198,8 +198,13 @@ static bool retransmits_timed_out(struct sock *sk, return false; start_ts = tcp_sk(sk)->retrans_stamp; - if (likely(timeout == 0)) - timeout = tcp_model_timeout(sk, boundary, TCP_RTO_MIN); + if (likely(timeout == 0)) { + unsigned int rto_base = TCP_RTO_MIN; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + rto_base = tcp_timeout_init(sk); + timeout = tcp_model_timeout(sk, boundary, rto_base); + } return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0; } @@ -210,7 +215,7 @@ static int tcp_write_timeout(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); - bool expired, do_reset; + bool expired = false, do_reset; int retry_until; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { @@ -218,6 +223,9 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); } else { sk_rethink_txhash(sk); + tp->timeout_rehash++; + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPTIMEOUTREHASH); } retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; expired = icsk->icsk_retransmits >= retry_until; @@ -229,6 +237,9 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); } else { sk_rethink_txhash(sk); + tp->timeout_rehash++; + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPTIMEOUTREHASH); } retry_until = net->ipv4.sysctl_tcp_retries2; @@ -242,9 +253,10 @@ static int tcp_write_timeout(struct sock *sk) if (tcp_out_of_resources(sk, do_reset)) return 1; } + } + if (!expired) expired = retransmits_timed_out(sk, retry_until, icsk->icsk_user_timeout); - } tcp_fastopen_active_detect_blackhole(sk, expired); if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) @@ -380,15 +392,13 @@ abort: tcp_write_err(sk); * Timer for Fast Open socket to retransmit SYNACK. Note that the * sk here is the child socket, not the parent (listener) socket. */ -static void tcp_fastopen_synack_timer(struct sock *sk) +static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) { struct inet_connection_sock *icsk = inet_csk(sk); int max_retries = icsk->icsk_syn_retries ? : sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ struct tcp_sock *tp = tcp_sk(sk); - struct request_sock *req; - req = tcp_sk(sk)->fastopen_rsk; req->rsk_ops->syn_ack_timeout(req); if (req->num_timeout >= max_retries) { @@ -429,17 +439,26 @@ void tcp_retransmit_timer(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct request_sock *req; + struct sk_buff *skb; - if (tp->fastopen_rsk) { + req = rcu_dereference_protected(tp->fastopen_rsk, + lockdep_sock_is_held(sk)); + if (req) { WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); - tcp_fastopen_synack_timer(sk); + tcp_fastopen_synack_timer(sk, req); /* Before we receive ACK to our SYN-ACK don't retransmit * anything else (e.g., data or FIN segments). */ return; } - if (!tp->packets_out || WARN_ON_ONCE(tcp_rtx_queue_empty(sk))) + + if (!tp->packets_out) + return; + + skb = tcp_rtx_queue_head(sk); + if (WARN_ON_ONCE(!skb)) return; tp->tlp_high_seq = 0; @@ -473,7 +492,7 @@ void tcp_retransmit_timer(struct sock *sk) goto out; } tcp_enter_loss(sk); - tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1); + tcp_retransmit_skb(sk, skb, 1); __sk_dst_reset(sk); goto out_reset_timer; } diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 4849edb62d52..38d3ad141161 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -92,21 +92,26 @@ void tcp_get_available_ulp(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ulp_ops->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; } rcu_read_unlock(); } -void tcp_update_ulp(struct sock *sk, struct proto *proto) +void tcp_update_ulp(struct sock *sk, struct proto *proto, + void (*write_space)(struct sock *sk)) { struct inet_connection_sock *icsk = inet_csk(sk); if (!icsk->icsk_ulp_ops) { + sk->sk_write_space = write_space; sk->sk_prot = proto; return; } if (icsk->icsk_ulp_ops->update) - icsk->icsk_ulp_ops->update(sk, proto); + icsk->icsk_ulp_ops->update(sk, proto, write_space); } void tcp_cleanup_ulp(struct sock *sk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d88821c794fb..db76b9609299 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -388,7 +388,7 @@ static int compute_score(struct sock *sk, struct net *net, return -1; score += 4; - if (sk->sk_incoming_cpu == raw_smp_processor_id()) + if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; return score; } @@ -423,12 +423,13 @@ static struct sock *udp4_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - if (sk->sk_reuseport) { + if (sk->sk_reuseport && + sk->sk_state != TCP_ESTABLISHED) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (result) + if (result && !reuseport_has_conns(sk, false)) return result; } badness = score; @@ -820,6 +821,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, int is_udplite = IS_UDPLITE(sk); int offset = skb_transport_offset(skb); int len = skb->len - offset; + int datalen = len - sizeof(*uh); __wsum csum = 0; /* @@ -853,10 +855,12 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, return -EIO; } - skb_shinfo(skb)->gso_size = cork->gso_size; - skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(uh), - cork->gso_size); + if (datalen > cork->gso_size) { + skb_shinfo(skb)->gso_size = cork->gso_size; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen, + cork->gso_size); + } goto csum_partial; } @@ -1130,7 +1134,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &fl4_stack; - flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, + flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, flow_flags, faddr, saddr, dport, inet->inet_sport, @@ -1293,6 +1297,27 @@ out: #define UDP_SKB_IS_STATELESS 0x80000000 +/* all head states (dst, sk, nf conntrack) except skb extensions are + * cleared by udp_rcv(). + * + * We need to preserve secpath, if present, to eventually process + * IP_CMSG_PASSSEC at recvmsg() time. + * + * Other extensions can be cleared. + */ +static bool udp_try_make_stateless(struct sk_buff *skb) +{ + if (!skb_has_extensions(skb)) + return true; + + if (!secpath_exists(skb)) { + skb_ext_reset(skb); + return true; + } + + return false; +} + static void udp_set_dev_scratch(struct sk_buff *skb) { struct udp_dev_scratch *scratch = udp_skb_scratch(skb); @@ -1304,14 +1329,24 @@ static void udp_set_dev_scratch(struct sk_buff *skb) scratch->csum_unnecessary = !!skb_csum_unnecessary(skb); scratch->is_linear = !skb_is_nonlinear(skb); #endif - /* all head states execept sp (dst, sk, nf) are always cleared by - * udp_rcv() and we need to preserve secpath, if present, to eventually - * process IP_CMSG_PASSSEC at recvmsg() time - */ - if (likely(!skb_sec_path(skb))) + if (udp_try_make_stateless(skb)) scratch->_tsize_state |= UDP_SKB_IS_STATELESS; } +static void udp_skb_csum_unnecessary_set(struct sk_buff *skb) +{ + /* We come here after udp_lib_checksum_complete() returned 0. + * This means that __skb_checksum_complete() might have + * set skb->csum_valid to 1. + * On 64bit platforms, we can set csum_unnecessary + * to true, but only if the skb is not shared. + */ +#if BITS_PER_LONG == 64 + if (!skb_shared(skb)) + udp_skb_scratch(skb)->csum_unnecessary = true; +#endif +} + static int udp_skb_truesize(struct sk_buff *skb) { return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS; @@ -1333,7 +1368,8 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; - if (size < (sk->sk_rcvbuf >> 2)) + if (size < (sk->sk_rcvbuf >> 2) && + !skb_queue_empty(&up->reader_queue)) return; } else { size += up->forward_deficit; @@ -1440,7 +1476,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) * queue contains some other skb */ rmem = atomic_add_return(size, &sk->sk_rmem_alloc); - if (rmem > (size + sk->sk_rcvbuf)) + if (rmem > (size + (unsigned int)sk->sk_rcvbuf)) goto uncharge_drop; spin_lock(&list->lock); @@ -1546,10 +1582,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk, *total += skb->truesize; kfree_skb(skb); } else { - /* the csum related bits could be changed, refresh - * the scratch area - */ - udp_set_dev_scratch(skb); + udp_skb_csum_unnecessary_set(skb); break; } } @@ -1573,7 +1606,7 @@ static int first_packet_length(struct sock *sk) spin_lock_bh(&rcvq->lock); skb = __first_packet_length(sk, rcvq, &total); - if (!skb && !skb_queue_empty(sk_queue)) { + if (!skb && !skb_queue_empty_lockless(sk_queue)) { spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, rcvq); spin_unlock(&sk_queue->lock); @@ -1646,7 +1679,7 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, return skb; } - if (skb_queue_empty(sk_queue)) { + if (skb_queue_empty_lockless(sk_queue)) { spin_unlock_bh(&queue->lock); goto busy_check; } @@ -1672,11 +1705,12 @@ busy_check: break; sk_busy_loop(sk, flags & MSG_DONTWAIT); - } while (!skb_queue_empty(sk_queue)); + } while (!skb_queue_empty_lockless(sk_queue)); /* sk_queue is empty, reader_queue may contain peeked packets */ } while (timeo && - !__skb_wait_for_more_packets(sk, &error, &timeo, + !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, + &error, &timeo, (struct sk_buff *)sk_queue)); *err = error; @@ -1968,7 +2002,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) */ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto drop; - nf_reset(skb); + nf_reset_ct(skb); if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); @@ -2072,8 +2106,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET); __skb_push(skb, -skb_mac_offset(skb)); segs = udp_rcv_segment(sk, skb, true); - for (skb = segs; skb; skb = next) { - next = skb->next; + skb_list_walk_safe(segs, skb, next) { __skb_pull(skb, skb_transport_offset(skb)); ret = udp_queue_rcv_one_skb(sk, skb); if (ret > 0) @@ -2297,7 +2330,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - nf_reset(skb); + nf_reset_ct(skb); /* No socket. Drop packet silently, if checksum is wrong */ if (udp_lib_checksum_complete(skb)) @@ -2519,9 +2552,11 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, case UDP_ENCAP: switch (val) { case 0: +#ifdef CONFIG_XFRM case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: up->encap_rcv = xfrm4_udp_encap_rcv; +#endif /* FALLTHROUGH */ case UDP_ENCAP_L2TPINUDP: up->encap_type = val; @@ -2708,7 +2743,7 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; - if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) + if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Check for false positives due to checksum errors */ diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index a3908e55ed89..1a98583a79f4 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -184,6 +184,20 @@ out_unlock: } EXPORT_SYMBOL(skb_udp_tunnel_segment); +static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb, + netdev_features_t features) +{ + unsigned int mss = skb_shinfo(skb)->gso_size; + + skb = skb_segment_list(skb, features, skb_mac_header_len(skb)); + if (IS_ERR(skb)) + return skb; + + udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss); + + return skb; +} + struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features) { @@ -196,6 +210,9 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, __sum16 check; __be16 newlen; + if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) + return __udp_gso_segment_list(gso_skb, features); + mss = skb_shinfo(gso_skb)->gso_size; if (gso_skb->len <= sizeof(*uh) + mss) return ERR_PTR(-EINVAL); @@ -354,6 +371,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, struct udphdr *uh2; struct sk_buff *p; unsigned int ulen; + int ret = 0; /* requires non zero csum, for symmetry with GSO */ if (!uh->check) { @@ -369,7 +387,6 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, } /* pull encapsulating udp header */ skb_gro_pull(skb, sizeof(struct udphdr)); - skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) @@ -383,14 +400,40 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, continue; } + if (NAPI_GRO_CB(skb)->is_flist != NAPI_GRO_CB(p)->is_flist) { + NAPI_GRO_CB(skb)->flush = 1; + return p; + } + /* Terminate the flow on len mismatch or if it grow "too much". * Under small packet flood GRO count could elsewhere grow a lot * leading to excessive truesize values. * On len mismatch merge the first packet shorter than gso_size, * otherwise complete the GRO packet. */ - if (ulen > ntohs(uh2->len) || skb_gro_receive(p, skb) || - ulen != ntohs(uh2->len) || + if (ulen > ntohs(uh2->len)) { + pp = p; + } else { + if (NAPI_GRO_CB(skb)->is_flist) { + if (!pskb_may_pull(skb, skb_gro_offset(skb))) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + if ((skb->ip_summed != p->ip_summed) || + (skb->csum_level != p->csum_level)) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + ret = skb_gro_receive_list(p, skb); + } else { + skb_gro_postpull_rcsum(skb, uh, + sizeof(struct udphdr)); + + ret = skb_gro_receive(p, skb); + } + } + + if (ret || ulen != ntohs(uh2->len) || NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX) pp = p; @@ -401,36 +444,29 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, return NULL; } -INDIRECT_CALLABLE_DECLARE(struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, - __be16 sport, __be16 dport)); struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, - struct udphdr *uh, udp_lookup_t lookup) + struct udphdr *uh, struct sock *sk) { struct sk_buff *pp = NULL; struct sk_buff *p; struct udphdr *uh2; unsigned int off = skb_gro_offset(skb); int flush = 1; - struct sock *sk; - rcu_read_lock(); - sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb, - udp4_lib_lookup_skb, skb, uh->source, uh->dest); - if (!sk) - goto out_unlock; + if (skb->dev->features & NETIF_F_GRO_FRAGLIST) + NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled: 1; - if (udp_sk(sk)->gro_enabled) { + if ((sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) { pp = call_gro_receive(udp_gro_receive_segment, head, skb); - rcu_read_unlock(); return pp; } - if (NAPI_GRO_CB(skb)->encap_mark || + if (!sk || NAPI_GRO_CB(skb)->encap_mark || (skb->ip_summed != CHECKSUM_PARTIAL && NAPI_GRO_CB(skb)->csum_cnt == 0 && !NAPI_GRO_CB(skb)->csum_valid) || !udp_sk(sk)->gro_receive) - goto out_unlock; + goto out; /* mark that this skb passed once through the tunnel gro layer */ NAPI_GRO_CB(skb)->encap_mark = 1; @@ -457,8 +493,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); -out_unlock: - rcu_read_unlock(); +out: skb_gro_flush_final(skb, pp, flush); return pp; } @@ -468,8 +503,10 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); + struct sk_buff *pp; + struct sock *sk; - if (unlikely(!uh) || !static_branch_unlikely(&udp_encap_needed_key)) + if (unlikely(!uh)) goto flush; /* Don't bother verifying checksum if we're going to flush anyway. */ @@ -480,11 +517,15 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) inet_gro_compute_pseudo)) goto flush; else if (uh->check) - skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + skb_gro_checksum_try_convert(skb, IPPROTO_UDP, inet_gro_compute_pseudo); skip: NAPI_GRO_CB(skb)->is_ipv6 = 0; - return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb); + rcu_read_lock(); + sk = static_branch_unlikely(&udp_encap_needed_key) ? udp4_lib_lookup_skb(skb, uh->source, uh->dest) : NULL; + pp = udp_gro_receive(head, skb, uh, sk); + rcu_read_unlock(); + return pp; flush: NAPI_GRO_CB(skb)->flush = 1; @@ -517,9 +558,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, rcu_read_lock(); sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb, udp4_lib_lookup_skb, skb, uh->source, uh->dest); - if (sk && udp_sk(sk)->gro_enabled) { - err = udp_gro_complete_segment(skb); - } else if (sk && udp_sk(sk)->gro_complete) { + if (sk && udp_sk(sk)->gro_complete) { skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; @@ -529,6 +568,8 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, skb->encapsulation = 1; err = udp_sk(sk)->gro_complete(sk, skb, nhoff + sizeof(struct udphdr)); + } else { + err = udp_gro_complete_segment(skb); } rcu_read_unlock(); @@ -544,6 +585,23 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) const struct iphdr *iph = ip_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + if (NAPI_GRO_CB(skb)->is_flist) { + uh->len = htons(skb->len - nhoff); + + skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (skb->csum_level < SKB_MAX_CSUM_LEVEL) + skb->csum_level++; + } else { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->csum_level = 0; + } + + return 0; + } + if (uh->check) uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr, iph->daddr, 0); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index ecff3fce9807..89ba7c87de5d 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -92,7 +92,7 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, NULL, skb_dst(skb)->dev, + net, sk, skb, skb->dev, skb_dst(skb)->dev, __xfrm4_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index cdef8f9a3b01..9ebd54752e03 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -85,6 +85,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; + xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_gw_family = rt->rt_gw_family; if (rt->rt_gw_family == AF_INET) xdst->u.rt.rt_gw4 = rt->rt_gw4; @@ -99,12 +100,13 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, } static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; - path->ops->update_pmtu(path, sk, skb, mtu); + path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh); } static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk, diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index 8a4285712808..ea595c8549c7 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -72,6 +72,14 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, if (!head) goto out; + if (!skb_dst(skb)) { + const struct iphdr *iph = ip_hdr(skb); + + if (ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev)) + goto drop; + } + for_each_protocol_rcu(*head, handler) if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) return ret; @@ -79,6 +87,7 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, out: icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); +drop: kfree_skb(skb); return 0; } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index dc73888c7859..cb493e15959c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -478,7 +478,7 @@ static struct inet6_dev *ipv6_find_idev(struct net_device *dev) if (!idev) { idev = ipv6_add_dev(dev); if (IS_ERR(idev)) - return NULL; + return idev; } if (dev->flags&IFF_UP) @@ -1045,7 +1045,8 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, int err = 0; if (addr_type == IPV6_ADDR_ANY || - addr_type & IPV6_ADDR_MULTICAST || + (addr_type & IPV6_ADDR_MULTICAST && + !(cfg->ifa_flags & IFA_F_MCAUTOJOIN)) || (!(idev->dev->flags & IFF_LOOPBACK) && !netif_is_l3_master(idev->dev) && addr_type & IPV6_ADDR_LOOPBACK)) @@ -2465,8 +2466,8 @@ static struct inet6_dev *addrconf_add_dev(struct net_device *dev) ASSERT_RTNL(); idev = ipv6_find_idev(dev); - if (!idev) - return ERR_PTR(-ENOBUFS); + if (IS_ERR(idev)) + return idev; if (idev->cnf.disable_ipv6) return ERR_PTR(-EACCES); @@ -3158,7 +3159,7 @@ static void init_loopback(struct net_device *dev) ASSERT_RTNL(); idev = ipv6_find_idev(dev); - if (!idev) { + if (IS_ERR(idev)) { pr_debug("%s: add_dev failed\n", __func__); return; } @@ -3373,7 +3374,7 @@ static void addrconf_sit_config(struct net_device *dev) */ idev = ipv6_find_idev(dev); - if (!idev) { + if (IS_ERR(idev)) { pr_debug("%s: add_dev failed\n", __func__); return; } @@ -3398,7 +3399,7 @@ static void addrconf_gre_config(struct net_device *dev) ASSERT_RTNL(); idev = ipv6_find_idev(dev); - if (!idev) { + if (IS_ERR(idev)) { pr_debug("%s: add_dev failed\n", __func__); return; } @@ -4772,8 +4773,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; idev = ipv6_find_idev(dev); - if (!idev) - return -ENOBUFS; + if (IS_ERR(idev)) + return PTR_ERR(idev); if (!ipv6_allow_optimistic_dad(net, idev)) cfg.ifa_flags &= ~IFA_F_OPTIMISTIC; @@ -5230,16 +5231,16 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, return -EINVAL; } + if (!netlink_strict_get_check(skb)) + return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv6_policy, extack); + ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get address request"); return -EINVAL; } - if (!netlink_strict_get_check(skb)) - return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, - ifa_ipv6_policy, extack); - err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, extack); if (err) @@ -5551,14 +5552,13 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr)); if (!nla) goto nla_put_failure; - - if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode)) - goto nla_put_failure; - read_lock_bh(&idev->lock); memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla)); read_unlock_bh(&idev->lock); + if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -5718,6 +5718,9 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) struct nlattr *tb[IFLA_INET6_MAX + 1]; int err; + if (!idev) + return -EAFNOSUPPORT; + if (nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla, NULL, NULL) < 0) BUG(); @@ -5963,13 +5966,20 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) switch (event) { case RTM_NEWADDR: /* - * If the address was optimistic - * we inserted the route at the start of - * our DAD process, so we don't need - * to do it again + * If the address was optimistic we inserted the route at the + * start of our DAD process, so we don't need to do it again. + * If the device was taken down in the middle of the DAD + * cycle there is a race where we could get here without a + * host route, so nothing to insert. That will be fixed when + * the device is brought up. */ - if (!rcu_access_pointer(ifp->rt->fib6_node)) + if (ifp->rt && !rcu_access_pointer(ifp->rt->fib6_node)) { ip6_ins_rt(net, ifp->rt); + } else if (!ifp->rt && (ifp->idev->dev->flags & IFF_UP)) { + pr_warn("BUG: Address %pI6c on device %s is missing its host route.\n", + &ifp->addr, ifp->idev->dev->name); + } + if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); if (!ipv6_addr_any(&ifp->peer_addr)) diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 783f3c1466da..ea00ce3d4117 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -7,6 +7,7 @@ #include <linux/export.h> #include <net/ipv6.h> #include <net/ipv6_stubs.h> +#include <net/addrconf.h> #include <net/ip.h> /* if ipv6 module registers this function is used by xfrm to force all @@ -128,11 +129,12 @@ int inet6addr_validator_notifier_call_chain(unsigned long val, void *v) } EXPORT_SYMBOL(inet6addr_validator_notifier_call_chain); -static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1, - struct dst_entry **u2, - struct flowi6 *u3) +static struct dst_entry *eafnosupport_ipv6_dst_lookup_flow(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, + const struct in6_addr *final_dst) { - return -EAFNOSUPPORT; + return ERR_PTR(-EAFNOSUPPORT); } static int eafnosupport_ipv6_route_input(struct sk_buff *skb) @@ -189,7 +191,7 @@ static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt) } const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { - .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, + .ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow, .ipv6_route_input = eafnosupport_ipv6_route_input, .fib6_get_table = eafnosupport_fib6_get_table, .fib6_table_lookup = eafnosupport_fib6_table_lookup, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index ef37e0574f54..d727c3b41495 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -292,7 +292,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, return -EINVAL; snum = ntohs(addr->sin6_port); - if (snum && snum < inet_prot_sock(net) && + if (snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; @@ -765,7 +765,7 @@ int inet6_sk_rebuild_header(struct sock *sk) &final); rcu_read_unlock(); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { sk->sk_route_caps = 0; sk->sk_err_soft = -PTR_ERR(dst); @@ -946,7 +946,7 @@ static int ipv6_route_input(struct sk_buff *skb) static const struct ipv6_stub ipv6_stub_impl = { .ipv6_sock_mc_join = ipv6_sock_mc_join, .ipv6_sock_mc_drop = ipv6_sock_mc_drop, - .ipv6_dst_lookup = ip6_dst_lookup, + .ipv6_dst_lookup_flow = ip6_dst_lookup_flow, .ipv6_route_input = ipv6_route_input, .fib6_get_table = fib6_get_table, .fib6_table_lookup = fib6_table_lookup, diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 9ab897ded4df..390bedde21a5 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -27,6 +27,7 @@ #include <net/ip6_route.h> #include <net/tcp_states.h> #include <net/dsfield.h> +#include <net/sock_reuseport.h> #include <linux/errqueue.h> #include <linux/uaccess.h> @@ -84,7 +85,7 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) final_p = fl6_update_dst(&fl6, opt, &final); rcu_read_unlock(); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; @@ -254,6 +255,7 @@ ipv4_connected: goto out; } + reuseport_has_conns(sk, true); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); out: diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index e31626ffccd1..fd535053245b 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -79,6 +79,8 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, if (!x) goto out_reset; + skb->mark = xfrm_smark_get(skb->mark, x); + sp->xvec[sp->len++] = x; sp->olen++; diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index b358f1a4dd08..da46c4284676 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -197,10 +197,8 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, struct ipv6hdr _ip6, *ip6; ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6); - if (!ip6 || (ip6->version != 6)) { - printk(KERN_ERR "IPv6 header not found\n"); + if (!ip6 || (ip6->version != 6)) return -EBADMSG; - } start = *offset + sizeof(struct ipv6hdr); nexthdr = ip6->nexthdr; } diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c index 05f82baaa99e..f87ae33e1d01 100644 --- a/net/ipv6/fib6_notifier.c +++ b/net/ipv6/fib6_notifier.c @@ -7,12 +7,12 @@ #include <net/netns/ipv6.h> #include <net/ip6_fib.h> -int call_fib6_notifier(struct notifier_block *nb, struct net *net, +int call_fib6_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET6; - return call_fib_notifier(nb, net, event_type, info); + return call_fib_notifier(nb, event_type, info); } int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, @@ -27,15 +27,16 @@ static unsigned int fib6_seq_read(struct net *net) return fib6_tables_seq_read(net) + fib6_rules_seq_read(net); } -static int fib6_dump(struct net *net, struct notifier_block *nb) +static int fib6_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { int err; - err = fib6_rules_dump(net, nb); + err = fib6_rules_dump(net, nb, extack); if (err) return err; - return fib6_tables_dump(net, nb); + return fib6_tables_dump(net, nb, extack); } static const struct fib_notifier_ops fib6_notifier_ops_template = { diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index d22b6c140f23..fafe556d21e0 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -47,9 +47,10 @@ bool fib6_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL_GPL(fib6_rule_default); -int fib6_rules_dump(struct net *net, struct notifier_block *nb) +int fib6_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, AF_INET6); + return fib_rules_dump(net, nb, AF_INET6, extack); } unsigned int fib6_rules_seq_read(struct net *net) @@ -287,7 +288,8 @@ static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg return false; suppress_route: - ip6_rt_put(rt); + if (!(arg->flags & FIB_LOOKUP_NOREF)) + ip6_rt_put(rt); return true; } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 62c997201970..ef408a5090a2 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -516,13 +516,29 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, mip6_addr_swap(skb); + sk = icmpv6_xmit_lock(net); + if (!sk) + goto out_bh_enable; + memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.daddr = hdr->saddr; if (force_saddr) saddr = force_saddr; - if (saddr) + if (saddr) { fl6.saddr = *saddr; + } else { + /* select a more meaningful saddr from input if */ + struct net_device *in_netdev; + + in_netdev = dev_get_by_index(net, IP6CB(skb)->iif); + if (in_netdev) { + ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr, + inet6_sk(sk)->srcprefs, + &fl6.saddr); + dev_put(in_netdev); + } + } fl6.flowi6_mark = mark; fl6.flowi6_oif = iif; fl6.fl6_icmp_type = type; @@ -531,10 +547,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); - sk = icmpv6_xmit_lock(net); - if (!sk) - goto out_bh_enable; - sk->sk_mark = mark; np = inet6_sk(sk); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 4da24aa6c696..e315526fa244 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -48,7 +48,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, fl6->flowi6_uid = sk->sk_uid; security_req_classify_flow(req, flowi6_to_flowi(fl6)); - dst = ip6_dst_lookup_flow(sk, fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(dst)) return NULL; @@ -103,7 +103,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = __inet6_csk_dst_check(sk, np->dst_cookie); if (!dst) { - dst = ip6_dst_lookup_flow(sk, fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) ip6_dst_store(sk, dst, NULL, NULL); @@ -133,7 +133,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused fl6.daddr = sk->sk_v6_daddr; res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt), - np->tclass); + np->tclass, sk->sk_priority); rcu_read_unlock(); return res; } @@ -146,7 +146,7 @@ struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) if (IS_ERR(dst)) return NULL; - dst->ops->update_pmtu(dst, sk, NULL, mtu); + dst->ops->update_pmtu(dst, sk, NULL, mtu, true); dst = inet6_csk_route_socket(sk, &fl6); return IS_ERR(dst) ? NULL : dst; diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index cf60fae9533b..fbe9d4295eac 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -105,7 +105,7 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score = 1; - if (sk->sk_incoming_cpu == raw_smp_processor_id()) + if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } return score; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 87f47bc55c5e..58fbde244381 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -318,7 +318,7 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; - if (!(flags | RT6_LOOKUP_F_DST_NOREF)) + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); } @@ -357,15 +357,32 @@ unsigned int fib6_tables_seq_read(struct net *net) return fib_seq; } -static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, +static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, - struct fib6_info *rt) + struct fib6_info *rt, + struct netlink_ext_ack *extack) +{ + struct fib6_entry_notifier_info info = { + .info.extack = extack, + .rt = rt, + }; + + return call_fib6_notifier(nb, event_type, &info.info); +} + +static int call_fib6_multipath_entry_notifier(struct notifier_block *nb, + enum fib_event_type event_type, + struct fib6_info *rt, + unsigned int nsiblings, + struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { + .info.extack = extack, .rt = rt, + .nsiblings = nsiblings, }; - return call_fib6_notifier(nb, net, event_type, &info.info); + return call_fib6_notifier(nb, event_type, &info.info); } int call_fib6_entry_notifiers(struct net *net, @@ -398,43 +415,72 @@ int call_fib6_multipath_entry_notifiers(struct net *net, return call_fib6_notifiers(net, event_type, &info.info); } +int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) +{ + struct fib6_entry_notifier_info info = { + .rt = rt, + .nsiblings = rt->fib6_nsiblings, + }; + + rt->fib6_table->fib_seq++; + return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); +} + struct fib6_dump_arg { struct net *net; struct notifier_block *nb; + struct netlink_ext_ack *extack; }; -static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) +static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { - if (rt == arg->net->ipv6.fib6_null_entry) - return; - call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt); + enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE; + int err; + + if (!rt || rt == arg->net->ipv6.fib6_null_entry) + return 0; + + if (rt->fib6_nsiblings) + err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, + rt, + rt->fib6_nsiblings, + arg->extack); + else + err = call_fib6_entry_notifier(arg->nb, fib_event, rt, + arg->extack); + + return err; } static int fib6_node_dump(struct fib6_walker *w) { - struct fib6_info *rt; + int err; - for_each_fib6_walker_rt(w) - fib6_rt_dump(rt, w->args); + err = fib6_rt_dump(w->leaf, w->args); w->leaf = NULL; - return 0; + return err; } -static void fib6_table_dump(struct net *net, struct fib6_table *tb, - struct fib6_walker *w) +static int fib6_table_dump(struct net *net, struct fib6_table *tb, + struct fib6_walker *w) { + int err; + w->root = &tb->tb6_root; spin_lock_bh(&tb->tb6_lock); - fib6_walk(net, w); + err = fib6_walk(net, w); spin_unlock_bh(&tb->tb6_lock); + return err; } /* Called with rcu_read_lock() */ -int fib6_tables_dump(struct net *net, struct notifier_block *nb) +int fib6_tables_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct fib6_dump_arg arg; struct fib6_walker *w; unsigned int h; + int err = 0; w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) @@ -443,19 +489,24 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb) w->func = fib6_node_dump; arg.net = net; arg.nb = nb; + arg.extack = extack; w->args = &arg; for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; - hlist_for_each_entry_rcu(tb, head, tb6_hlist) - fib6_table_dump(net, tb, w); + hlist_for_each_entry_rcu(tb, head, tb6_hlist) { + err = fib6_table_dump(net, tb, w); + if (err < 0) + goto out; + } } +out: kfree(w); - return 0; + return err; } static int fib6_dump_node(struct fib6_walker *w) @@ -1021,6 +1072,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); + bool notify_sibling_rt = false; u16 nlflags = NLM_F_EXCL; int err; @@ -1112,6 +1164,7 @@ next_iter: /* Find the first route that have the same metric */ sibling = leaf; + notify_sibling_rt = true; while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { @@ -1121,6 +1174,7 @@ next_iter: } sibling = rcu_dereference_protected(sibling->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); + notify_sibling_rt = false; } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings @@ -1147,10 +1201,21 @@ next_iter: add: nlflags |= NLM_F_CREATE; - if (!info->skip_notify_kernel) { + /* The route should only be notified if it is the first + * route in the node or if it is added as a sibling + * route to the first route in the node. + */ + if (!info->skip_notify_kernel && + (notify_sibling_rt || ins == &fn->leaf)) { + enum fib_event_type fib_event; + + if (notify_sibling_rt) + fib_event = FIB_EVENT_ENTRY_APPEND; + else + fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_entry_notifiers(info->nl_net, - FIB_EVENT_ENTRY_ADD, - rt, extack); + fib_event, rt, + extack); if (err) { struct fib6_info *sibling, *next_sibling; @@ -1194,7 +1259,7 @@ add: return -ENOENT; } - if (!info->skip_notify_kernel) { + if (!info->skip_notify_kernel && ins == &fn->leaf) { err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); @@ -1443,6 +1508,8 @@ out: } #endif goto failure; + } else if (fib6_requires_src(rt)) { + fib6_routes_require_src_inc(info->nl_net); } return err; @@ -1825,13 +1892,29 @@ static struct fib6_node *fib6_repair_tree(struct net *net, static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct fib6_info __rcu **rtp, struct nl_info *info) { + struct fib6_info *leaf, *replace_rt = NULL; struct fib6_walker *w; struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; + bool notify_del = false; RT6_TRACE("fib6_del_route\n"); + /* If the deleted route is the first in the node and it is not part of + * a multipath route, then we need to replace it with the next route + * in the node, if exists. + */ + leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + if (leaf == rt && !rt->fib6_nsiblings) { + if (rcu_access_pointer(rt->fib6_next)) + replace_rt = rcu_dereference_protected(rt->fib6_next, + lockdep_is_held(&table->tb6_lock)); + else + notify_del = true; + } + /* Unlink it */ *rtp = rt->fib6_next; rt->fib6_node = NULL; @@ -1849,6 +1932,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; + /* The route is deleted from a multipath route. If this + * multipath route is the first route in the node, then we need + * to emit a delete notification. Otherwise, we need to skip + * the notification. + */ + if (rt->fib6_metric == leaf->fib6_metric && + rt6_qualify_for_ecmp(leaf)) + notify_del = true; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; @@ -1884,8 +1975,13 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, fib6_purge_rt(rt, fn, net); - if (!info->skip_notify_kernel) - call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); + if (!info->skip_notify_kernel) { + if (notify_del) + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, + rt, NULL); + else if (replace_rt) + call_fib6_entry_notifiers_replace(net, replace_rt); + } if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); @@ -1915,6 +2011,8 @@ int fib6_del(struct fib6_info *rt, struct nl_info *info) struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { + if (fib6_requires_src(cur)) + fib6_routes_require_src_dec(info->nl_net); fib6_del_route(table, fn, rtp, info); return 0; } @@ -2473,14 +2571,13 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; + ++(*pos); if (!v) goto iter_table; n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next); - if (n) { - ++*pos; + if (n) return n; - } iter_table: ipv6_route_check_sernum(iter); @@ -2488,8 +2585,6 @@ iter_table: r = fib6_walk_continue(&iter->w); spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { - if (v) - ++*pos; return iter->w.leaf; } else if (r < 0) { fib6_walker_unlink(net, &iter->w); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index dd2d0b963260..55bfc5149d0c 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -968,7 +968,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET6)) - return -EINVAL; + goto tx_err; key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); @@ -980,9 +980,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, dsfield = key->tos; if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) goto tx_err; - md = ip_tunnel_info_opts(tun_info); - if (!md) + if (tun_info->options_len < sizeof(*md)) goto tx_err; + md = ip_tunnel_info_opts(tun_info); tun_id = tunnel_id_to_key32(key->tun_id); if (md->version == 1) { @@ -1040,7 +1040,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, /* TooBig packet may have updated dst->dev's mtu */ if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) - dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu); + dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu, false); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, NEXTHDR_GRE); @@ -1466,7 +1466,6 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) dev->mtu -= 8; if (tunnel->parms.collect_md) { - dev->features |= NETIF_F_NETNS_LOCAL; netif_keep_dst(dev); } ip6gre_tnl_init_features(dev); @@ -1894,7 +1893,6 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; - dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); @@ -2170,8 +2168,8 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, [IFLA_GRE_IKEY] = { .type = NLA_U32 }, [IFLA_GRE_OKEY] = { .type = NLA_U32 }, - [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct ipv6hdr, saddr) }, - [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct ipv6hdr, daddr) }, + [IFLA_GRE_LOCAL] = { .len = sizeof_field(struct ipv6hdr, saddr) }, + [IFLA_GRE_REMOTE] = { .len = sizeof_field(struct ipv6hdr, daddr) }, [IFLA_GRE_TTL] = { .type = NLA_U8 }, [IFLA_GRE_ENCAP_LIMIT] = { .type = NLA_U8 }, [IFLA_GRE_FLOWINFO] = { .type = NLA_U32 }, @@ -2192,11 +2190,11 @@ static void ip6erspan_tap_setup(struct net_device *dev) { ether_setup(dev); + dev->max_mtu = 0; dev->netdev_ops = &ip6erspan_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; - dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c index 02045494c24c..e0086758b6ee 100644 --- a/net/ipv6/ip6_icmp.c +++ b/net/ipv6/ip6_icmp.c @@ -45,4 +45,38 @@ out: rcu_read_unlock(); } EXPORT_SYMBOL(icmpv6_send); + +#if IS_ENABLED(CONFIG_NF_NAT) +#include <net/netfilter/nf_conntrack.h> +void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) +{ + struct sk_buff *cloned_skb = NULL; + enum ip_conntrack_info ctinfo; + struct in6_addr orig_ip; + struct nf_conn *ct; + + ct = nf_ct_get(skb_in, &ctinfo); + if (!ct || !(ct->status & IPS_SRC_NAT)) { + icmpv6_send(skb_in, type, code, info); + return; + } + + if (skb_shared(skb_in)) + skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC); + + if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head || + (skb_network_header(skb_in) + sizeof(struct ipv6hdr)) > + skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in, + skb_network_offset(skb_in) + sizeof(struct ipv6hdr)))) + goto out; + + orig_ip = ipv6_hdr(skb_in)->saddr; + ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6; + icmpv6_send(skb_in, type, code, info); + ipv6_hdr(skb_in)->saddr = orig_ip; +out: + consume_skb(cloned_skb); +} +EXPORT_SYMBOL(icmpv6_ndo_send); +#endif #endif diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index fa014d5f1732..7b089d0ac8cd 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -80,15 +80,33 @@ static void ip6_sublist_rcv_finish(struct list_head *head) { struct sk_buff *skb, *next; - list_for_each_entry_safe(skb, next, head, list) + list_for_each_entry_safe(skb, next, head, list) { + skb_list_del_init(skb); dst_input(skb); + } +} + +static bool ip6_can_use_hint(const struct sk_buff *skb, + const struct sk_buff *hint) +{ + return hint && !skb_dst(skb) && + ipv6_addr_equal(&ipv6_hdr(hint)->daddr, &ipv6_hdr(skb)->daddr); +} + +static struct sk_buff *ip6_extract_route_hint(const struct net *net, + struct sk_buff *skb) +{ + if (fib6_routes_require_src(net) || fib6_has_custom_rules(net)) + return NULL; + + return skb; } static void ip6_list_rcv_finish(struct net *net, struct sock *sk, struct list_head *head) { + struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; - struct sk_buff *skb, *next; struct list_head sublist; INIT_LIST_HEAD(&sublist); @@ -102,9 +120,15 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk, skb = l3mdev_ip6_rcv(skb); if (!skb) continue; - ip6_rcv_finish_core(net, sk, skb); + + if (ip6_can_use_hint(skb, hint)) + skb_dst_copy(skb, hint); + else + ip6_rcv_finish_core(net, sk, skb); dst = skb_dst(skb); if (curr_dst != dst) { + hint = ip6_extract_route_hint(net, skb); + /* dispatch old sublist */ if (!list_empty(&sublist)) ip6_sublist_rcv_finish(&sublist); @@ -221,6 +245,16 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, if (ipv6_addr_is_multicast(&hdr->saddr)) goto err; + /* While RFC4291 is not explicit about v4mapped addresses + * in IPv6 headers, it seems clear linux dual-stack + * model can not deal properly with these. + * Security models could be fooled by ::ffff:127.0.0.1 for example. + * + * https://tools.ietf.org/html/draft-itojun-v6ops-v4mapped-harmful-02 + */ + if (ipv6_addr_v4mapped(&hdr->saddr)) + goto err; + skb->transport_header = skb->network_header + sizeof(*hdr); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); @@ -313,7 +347,8 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ - ip6_sublist_rcv(&sublist, curr_dev, curr_net); + if (!list_empty(&sublist)) + ip6_sublist_rcv(&sublist, curr_dev, curr_net); } INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); @@ -369,7 +404,7 @@ resubmit_final: /* Free reference early: we don't need it any more, and it may hold ip_conntrack module loaded indefinitely. */ - nf_reset(skb); + nf_reset_ct(skb); skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8e49fd62eea9..087304427bbb 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -160,7 +160,7 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev; + struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); skb->protocol = htons(ETH_P_IPV6); @@ -173,7 +173,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) } return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, - net, sk, skb, NULL, dev, + net, sk, skb, indev, dev, ip6_finish_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } @@ -193,7 +193,7 @@ bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) * which are using proper atomic operations or spinlocks. */ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, - __u32 mark, struct ipv6_txoptions *opt, int tclass) + __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) { struct net *net = sock_net(sk); const struct ipv6_pinfo *np = inet6_sk(sk); @@ -258,7 +258,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, hdr->daddr = *first_hop; skb->protocol = htons(ETH_P_IPV6); - skb->priority = sk->sk_priority; + skb->priority = priority; skb->mark = mark; mtu = dst_mtu(dst); @@ -768,6 +768,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, inet6_sk(skb->sk) : NULL; struct ip6_frag_state state; unsigned int mtu, hlen, nexthdr_offset; + ktime_t tstamp = skb->tstamp; int hroom, err = 0; __be32 frag_id; u8 *prevhdr, nexthdr = 0; @@ -855,6 +856,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); + skb->tstamp = tstamp; err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), @@ -913,6 +915,7 @@ slow_path: /* * Put this fragment into the sending queue. */ + frag->tstamp = tstamp; err = output(net, sk, frag); if (err) goto fail; @@ -1141,19 +1144,19 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup); * It returns a valid dst pointer on success, or a pointer encoded * error code. */ -struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, +struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst) { struct dst_entry *dst = NULL; int err; - err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6); + err = ip6_dst_lookup_tail(net, sk, &dst, fl6); if (err) return ERR_PTR(err); if (final_dst) fl6->daddr = *final_dst; - return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); + return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0); } EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); @@ -1185,7 +1188,7 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, if (dst) return dst; - dst = ip6_dst_lookup_flow(sk, fl6, final_dst); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst); if (connected && !IS_ERR(dst)) ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6); @@ -1294,6 +1297,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.fragsize = mtu; cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; + cork->base.mark = ipc6->sockc.mark; sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); if (dst_allfrag(xfrm_dst_path(&rt->dst))) @@ -1764,7 +1768,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, hdr->daddr = *final_dst; skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = cork->base.mark; skb->tstamp = cork->base.transmit_time; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 754a484d35df..5d65436ad5ad 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -121,6 +121,7 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses + * @link: ifindex of underlying interface * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point * @@ -134,37 +135,56 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) static struct ip6_tnl * -ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local) +ip6_tnl_lookup(struct net *net, int link, + const struct in6_addr *remote, const struct in6_addr *local) { unsigned int hash = HASH(remote, local); - struct ip6_tnl *t; + struct ip6_tnl *t, *cand = NULL; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct in6_addr any; for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { - if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_equal(remote, &t->parms.raddr) && - (t->dev->flags & IFF_UP)) + if (!ipv6_addr_equal(local, &t->parms.laddr) || + !ipv6_addr_equal(remote, &t->parms.raddr) || + !(t->dev->flags & IFF_UP)) + continue; + + if (link == t->parms.link) return t; + else + cand = t; } memset(&any, 0, sizeof(any)); hash = HASH(&any, local); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { - if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_any(&t->parms.raddr) && - (t->dev->flags & IFF_UP)) + if (!ipv6_addr_equal(local, &t->parms.laddr) || + !ipv6_addr_any(&t->parms.raddr) || + !(t->dev->flags & IFF_UP)) + continue; + + if (link == t->parms.link) return t; + else if (!cand) + cand = t; } hash = HASH(remote, &any); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { - if (ipv6_addr_equal(remote, &t->parms.raddr) && - ipv6_addr_any(&t->parms.laddr) && - (t->dev->flags & IFF_UP)) + if (!ipv6_addr_equal(remote, &t->parms.raddr) || + !ipv6_addr_any(&t->parms.laddr) || + !(t->dev->flags & IFF_UP)) + continue; + + if (link == t->parms.link) return t; + else if (!cand) + cand = t; } + if (cand) + return cand; + t = rcu_dereference(ip6n->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; @@ -351,7 +371,8 @@ static struct ip6_tnl *ip6_tnl_locate(struct net *net, (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_equal(remote, &t->parms.raddr)) { + ipv6_addr_equal(remote, &t->parms.raddr) && + p->link == t->parms.link) { if (create) return ERR_PTR(-EEXIST); @@ -485,7 +506,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, processing of the error. */ rcu_read_lock(); - t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->daddr, &ipv6h->saddr); + t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->daddr, &ipv6h->saddr); if (!t) goto out; @@ -640,7 +661,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (rel_info > dst_mtu(skb_dst(skb2))) goto out; - skb_dst_update_pmtu(skb2, rel_info); + skb_dst_update_pmtu_no_confirm(skb2, rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); @@ -887,7 +908,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, int ret = -1; rcu_read_lock(); - t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr); + t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->saddr, &ipv6h->daddr); if (t) { u8 tproto = READ_ONCE(t->parms.proto); @@ -1132,7 +1153,7 @@ route_lookup: mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ? IPV6_MIN_MTU : IPV4_MIN_MTU); - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; @@ -1420,8 +1441,10 @@ tx_err: static void ip6_tnl_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; + struct net_device *tdev = NULL; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; + unsigned int mtu; int t_hlen; memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); @@ -1457,22 +1480,25 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); + if (rt) { + tdev = rt->dst.dev; + ip6_rt_put(rt); + } - if (!rt) - return; + if (!tdev && p->link) + tdev = __dev_get_by_index(t->net, p->link); - if (rt->dst.dev) { - dev->hard_header_len = rt->dst.dev->hard_header_len + - t_hlen; + if (tdev) { + dev->hard_header_len = tdev->hard_header_len + t_hlen; + mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); - dev->mtu = rt->dst.dev->mtu - t_hlen; + dev->mtu = mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; } - ip6_rt_put(rt); } } @@ -1877,10 +1903,8 @@ static int ip6_tnl_dev_init(struct net_device *dev) if (err) return err; ip6_tnl_link_config(t); - if (t->parms.collect_md) { - dev->features |= NETIF_F_NETNS_LOCAL; + if (t->parms.collect_md) netif_keep_dst(dev); - } return 0; } diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 024db17386d2..524006aa0d78 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -449,8 +449,17 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) int err = -1; int mtu; - if (!dst) - goto tx_err_link_failure; + if (!dst) { + fl->u.ip6.flowi6_oif = dev->ifindex; + fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; + dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); + if (dst->error) { + dst_release(dst); + dst = NULL; + goto tx_err_link_failure; + } + skb_dst_set(skb, dst); + } dst_hold(dst); dst = xfrm_lookup(t->net, dst, fl, NULL, 0); @@ -479,7 +488,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) mtu = dst_mtu(dst); if (skb->len > mtu) { - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index e80d36c5073d..bfa49ff70531 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -265,9 +265,10 @@ static void __net_exit ip6mr_rules_exit(struct net *net) rtnl_unlock(); } -static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb) +static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR); + return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack); } static unsigned int ip6mr_rules_seq_read(struct net *net) @@ -324,7 +325,8 @@ static void __net_exit ip6mr_rules_exit(struct net *net) rtnl_unlock(); } -static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb) +static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return 0; } @@ -1148,8 +1150,8 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, * Create a new entry if allowable */ - if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || - (c = ip6mr_cache_alloc_unres()) == NULL) { + c = ip6mr_cache_alloc_unres(); + if (!c) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); @@ -1256,10 +1258,11 @@ static unsigned int ip6mr_seq_read(struct net *net) return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net); } -static int ip6mr_dump(struct net *net, struct notifier_block *nb) +static int ip6mr_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump, - ip6mr_mr_table_iter, &mrt_lock); + ip6mr_mr_table_iter, &mrt_lock, extack); } static struct notifier_block ip6_mr_notifier = { diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 264c292e7dcc..79fc012dd2ca 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -363,8 +363,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, break; case IPV6_TRANSPARENT: - if (valbool && !ns_capable(net->user_ns, CAP_NET_ADMIN) && - !ns_capable(net->user_ns, CAP_NET_RAW)) { + if (valbool && !ns_capable(net->user_ns, CAP_NET_RAW) && + !ns_capable(net->user_ns, CAP_NET_ADMIN)) { retv = -EPERM; break; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 7f3f13c37916..eaa4c2cc2fbb 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -787,14 +787,15 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) if (pmc) { im->idev = pmc->idev; if (im->mca_sfmode == MCAST_INCLUDE) { - im->mca_tomb = pmc->mca_tomb; - im->mca_sources = pmc->mca_sources; + swap(im->mca_tomb, pmc->mca_tomb); + swap(im->mca_sources, pmc->mca_sources); for (psf = im->mca_sources; psf; psf = psf->sf_next) psf->sf_crcount = idev->mc_qrv; } else { im->mca_crcount = idev->mc_qrv; } in6_dev_put(pmc->idev); + ip6_mc_clear_src(pmc); kfree(pmc); } spin_unlock_bh(&im->mca_lock); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 083cc1c94cd3..53caf59c591e 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -196,6 +196,7 @@ static inline int ndisc_is_useropt(const struct net_device *dev, { return opt->nd_opt_type == ND_OPT_RDNSS || opt->nd_opt_type == ND_OPT_DNSSL || + opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL || ndisc_ops_is_useropt(dev, opt->nd_opt_type); } diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 61819ed858b1..409e79b84a83 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -113,12 +113,13 @@ int __nf_ip6_route(struct net *net, struct dst_entry **dst, EXPORT_SYMBOL_GPL(__nf_ip6_route); int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - struct nf_ct_bridge_frag_data *data, + struct nf_bridge_frag_data *data, int (*output)(struct net *, struct sock *sk, - const struct nf_ct_bridge_frag_data *data, + const struct nf_bridge_frag_data *data, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; + ktime_t tstamp = skb->tstamp; struct ip6_frag_state state; u8 *prevhdr, nexthdr = 0; unsigned int mtu, hlen; @@ -183,6 +184,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); + skb->tstamp = tstamp; err = output(net, sk, data, skb); if (err || !iter.frag) break; @@ -215,6 +217,7 @@ slow_path: goto blackhole; } + skb2->tstamp = tstamp; err = output(net, sk, data, skb2); if (err) goto blackhole; diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 6120a7800975..0594131fa46d 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -128,9 +128,9 @@ config IP6_NF_MATCH_HL depends on NETFILTER_ADVANCED select NETFILTER_XT_MATCH_HL ---help--- - This is a backwards-compat option for the user's convenience - (e.g. when running oldconfig). It selects - CONFIG_NETFILTER_XT_MATCH_HL. + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_MATCH_HL. config IP6_NF_MATCH_IPV6HEADER tristate '"ipv6header" IPv6 Extension Headers Match' @@ -170,13 +170,13 @@ config IP6_NF_MATCH_RT To compile it as a module, choose M here. If unsure, say N. config IP6_NF_MATCH_SRH - tristate '"srh" Segment Routing header match support' - depends on NETFILTER_ADVANCED - help - srh matching allows you to match packets based on the segment + tristate '"srh" Segment Routing header match support' + depends on NETFILTER_ADVANCED + help + srh matching allows you to match packets based on the segment routing header of the packet. - To compile it as a module, choose M here. If unsure, say N. + To compile it as a module, choose M here. If unsure, say N. # The targets config IP6_NF_TARGET_HL @@ -184,9 +184,9 @@ config IP6_NF_TARGET_HL depends on NETFILTER_ADVANCED && IP6_NF_MANGLE select NETFILTER_XT_TARGET_HL ---help--- - This is a backwards-compatible option for the user's convenience - (e.g. when running oldconfig). It selects - CONFIG_NETFILTER_XT_TARGET_HL. + This is a backwards-compatible option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_TARGET_HL. config IP6_NF_FILTER tristate "Packet filtering" @@ -245,14 +245,14 @@ config IP6_NF_RAW # security table for MAC policy config IP6_NF_SECURITY - tristate "Security table" - depends on SECURITY - depends on NETFILTER_ADVANCED - help - This option adds a `security' table to iptables, for use - with Mandatory Access Control (MAC) policy. - - If unsure, say N. + tristate "Security table" + depends on SECURITY + depends on NETFILTER_ADVANCED + help + This option adds a `security' table to iptables, for use + with Mandatory Access Control (MAC) policy. + + If unsure, say N. config IP6_NF_NAT tristate "ip6tables NAT support" diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 5cdb4a69d277..fd1f52a21bf1 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -36,8 +36,8 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) opts.options |= XT_SYNPROXY_OPT_ECN; opts.options &= info->options; - opts.mss_encode = opts.mss; - opts.mss = info->mss; + opts.mss_encode = opts.mss_option; + opts.mss_option = info->mss; if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, &opts); else diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c index 0fc6326ef499..c52ff929c93b 100644 --- a/net/ipv6/netfilter/ip6t_ipv6header.c +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -16,7 +16,7 @@ #include <net/ipv6.h> #include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6.h> #include <linux/netfilter_ipv6/ip6t_ipv6header.h> MODULE_LICENSE("GPL"); @@ -42,7 +42,7 @@ ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par) len = skb->len - ptr; temp = 0; - while (ip6t_ext_hdr(nexthdr)) { + while (nf_ip6_ext_hdr(nexthdr)) { const struct ipv6_opt_hdr *hp; struct ipv6_opt_hdr _hdr; int hdrlen; diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index e6c9da9866b1..a0a2de30be3e 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -54,7 +54,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, return; #if IS_ENABLED(CONFIG_NF_CONNTRACK) - nf_reset(skb); + nf_reset_ct(skb); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); #endif if (hooknum == NF_INET_PRE_ROUTING || diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c index f6d9a48c7a2a..a8566ee12e83 100644 --- a/net/ipv6/netfilter/nf_flow_table_ipv6.c +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -10,6 +10,8 @@ static struct nf_flowtable_type flowtable_ipv6 = { .family = NFPROTO_IPV6, .init = nf_flow_table_init, + .setup = nf_flow_table_offload_setup, + .action = nf_flow_rule_route_ipv6, .free = nf_flow_table_free, .hook = nf_flow_offload_ipv6_hook, .owner = THIS_MODULE, diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index f53bd8f01219..22b80db6d882 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -18,7 +18,7 @@ #include <net/route.h> #include <linux/netfilter.h> -#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6.h> #include <linux/netfilter/xt_LOG.h> #include <net/netfilter/nf_log.h> @@ -70,7 +70,7 @@ static void dump_ipv6_packet(struct net *net, struct nf_log_buf *m, fragment = 0; ptr = ip6hoff + sizeof(struct ipv6hdr); currenthdr = ih->nexthdr; - while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) { + while (currenthdr != NEXTHDR_NONE && nf_ip6_ext_hdr(currenthdr)) { struct ipv6_opt_hdr _hdr; const struct ipv6_opt_hdr *hp; diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index 437d95545c31..b9df879c48d3 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -12,7 +12,6 @@ #include <net/sock.h> #include <net/inet_sock.h> #include <net/inet6_hashtables.h> -#include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include <net/netfilter/nf_socket.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c index 34d51cd426b0..6bac68fb27a3 100644 --- a/net/ipv6/netfilter/nf_tproxy_ipv6.c +++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c @@ -150,4 +150,4 @@ EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v6); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs"); -MODULE_DESCRIPTION("Netfilter IPv4 transparent proxy support"); +MODULE_DESCRIPTION("Netfilter IPv6 transparent proxy support"); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 87d2d8c1db7c..98ac32b49d8c 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -223,7 +223,7 @@ static int __net_init ping_v6_proc_init_net(struct net *net) return 0; } -static void __net_init ping_v6_proc_exit_net(struct net *net) +static void __net_exit ping_v6_proc_exit_net(struct net *net) { remove_proc_entry("icmp6", net->proc_net); } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 8a6131991e38..dfe5e603ffe1 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -215,7 +215,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) /* Not releasing hash table! */ if (clone) { - nf_reset(clone); + nf_reset_ct(clone); rawv6_rcv(sk, clone); } } @@ -646,7 +646,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = sockc->mark; skb->tstamp = sockc->transmit_time; skb_put(skb, length); @@ -810,6 +810,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init(&ipc6); ipc6.sockc.tsflags = sk->sk_tsflags; + ipc6.sockc.mark = sk->sk_mark; if (sin6) { if (addr_len < SIN6_LEN_RFC2133) @@ -891,6 +892,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) opt = ipv6_fixup_options(&opt_space, opt); fl6.flowi6_proto = proto; + fl6.flowi6_mark = ipc6.sockc.mark; if (!hdrincl) { rfv.msg = msg; @@ -923,7 +925,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index fd059e08785a..4fbdc60b4e07 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -95,7 +95,8 @@ static int ip6_pkt_prohibit(struct sk_buff *skb); static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu); + struct sk_buff *skb, u32 mtu, + bool confirm_neigh); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, @@ -227,7 +228,7 @@ static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) struct net_device *dev = dst->dev; struct rt6_info *rt = (struct rt6_info *)dst; - daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr); + daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr); if (!daddr) return; if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) @@ -264,7 +265,8 @@ static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) } static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { } @@ -621,6 +623,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh) { struct __rt6_probe_work *work = NULL; const struct in6_addr *nh_gw; + unsigned long last_probe; struct neighbour *neigh; struct net_device *dev; struct inet6_dev *idev; @@ -633,12 +636,13 @@ static void rt6_probe(struct fib6_nh *fib6_nh) * Router Reachability Probe MUST be rate-limited * to no more than one per minute. */ - if (fib6_nh->fib_nh_gw_family) + if (!fib6_nh->fib_nh_gw_family) return; nh_gw = &fib6_nh->fib_nh_gw6; dev = fib6_nh->fib_nh_dev; rcu_read_lock_bh(); + last_probe = READ_ONCE(fib6_nh->last_probe); idev = __in6_dev_get(dev); neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { @@ -654,13 +658,15 @@ static void rt6_probe(struct fib6_nh *fib6_nh) __neigh_set_probe_once(neigh); } write_unlock(&neigh->lock); - } else if (time_after(jiffies, fib6_nh->last_probe + + } else if (time_after(jiffies, last_probe + idev->cnf.rtr_probe_interval)) { work = kmalloc(sizeof(*work), GFP_ATOMIC); } - if (work) { - fib6_nh->last_probe = jiffies; + if (!work || cmpxchg(&fib6_nh->last_probe, + last_probe, jiffies) != last_probe) { + kfree(work); + } else { INIT_WORK(&work->work, rt6_probe_deferred); work->target = *nh_gw; dev_hold(dev); @@ -1475,11 +1481,11 @@ static u32 rt6_exception_hash(const struct in6_addr *dst, u32 val; net_get_random_once(&seed, sizeof(seed)); - val = jhash(dst, sizeof(*dst), seed); + val = jhash2((const u32 *)dst, sizeof(*dst)/sizeof(u32), seed); #ifdef CONFIG_IPV6_SUBTREES if (src) - val = jhash(src, sizeof(*src), val); + val = jhash2((const u32 *)src, sizeof(*src)/sizeof(u32), val); #endif return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); } @@ -2291,10 +2297,7 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb, if (!icmph) goto out; - if (icmph->icmp6_type != ICMPV6_DEST_UNREACH && - icmph->icmp6_type != ICMPV6_PKT_TOOBIG && - icmph->icmp6_type != ICMPV6_TIME_EXCEED && - icmph->icmp6_type != ICMPV6_PARAMPROB) + if (!icmpv6_is_err(icmph->icmp6_type)) goto out; inner_iph = skb_header_pointer(skb, @@ -2691,7 +2694,8 @@ static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, - const struct ipv6hdr *iph, u32 mtu) + const struct ipv6hdr *iph, u32 mtu, + bool confirm_neigh) { const struct in6_addr *daddr, *saddr; struct rt6_info *rt6 = (struct rt6_info *)dst; @@ -2709,7 +2713,10 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, daddr = NULL; saddr = NULL; } - dst_confirm_neigh(dst, daddr); + + if (confirm_neigh) + dst_confirm_neigh(dst, daddr); + mtu = max_t(u32, mtu, IPV6_MIN_MTU); if (mtu >= dst_mtu(dst)) return; @@ -2725,10 +2732,9 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, rcu_read_lock(); res.f6i = rcu_dereference(rt6->from); - if (!res.f6i) { - rcu_read_unlock(); - return; - } + if (!res.f6i) + goto out_unlock; + res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; @@ -2744,10 +2750,8 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, /* fib6_info uses a nexthop that does not have fib6_nh * using the dst->dev + gw. Should be impossible. */ - if (!arg.match) { - rcu_read_unlock(); - return; - } + if (!arg.match) + goto out_unlock; res.nh = arg.match; } else { @@ -2760,14 +2764,17 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (rt6_insert_exception(nrt6, &res)) dst_release_immediate(&nrt6->dst); } +out_unlock: rcu_read_unlock(); } } static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { - __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); + __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu, + confirm_neigh); } void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, @@ -2786,7 +2793,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) - __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); + __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_update_pmtu); @@ -3385,6 +3392,9 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, int err; fib6_nh->fib_nh_family = AF_INET6; +#ifdef CONFIG_IPV6_ROUTER_PREF + fib6_nh->last_probe = jiffies; +#endif err = -ENODEV; if (cfg->fc_ifindex) { @@ -3747,6 +3757,7 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { struct fib6_info *sibling, *next_sibling; + struct fib6_node *fn; /* prefer to send a single notification with all hops */ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); @@ -3762,12 +3773,32 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) info->skip_notify = 1; } + /* 'rt' points to the first sibling route. If it is not the + * leaf, then we do not need to send a notification. Otherwise, + * we need to check if the last sibling has a next route or not + * and emit a replace or delete notification, respectively. + */ info->skip_notify_kernel = 1; - call_fib6_multipath_entry_notifiers(net, - FIB_EVENT_ENTRY_DEL, - rt, - rt->fib6_nsiblings, - NULL); + fn = rcu_dereference_protected(rt->fib6_node, + lockdep_is_held(&table->tb6_lock)); + if (rcu_access_pointer(fn->leaf) == rt) { + struct fib6_info *last_sibling, *replace_rt; + + last_sibling = list_last_entry(&rt->fib6_siblings, + struct fib6_info, + fib6_siblings); + replace_rt = rcu_dereference_protected( + last_sibling->fib6_next, + lockdep_is_held(&table->tb6_lock)); + if (replace_rt) + call_fib6_entry_notifiers_replace(net, + replace_rt); + else + call_fib6_multipath_entry_notifiers(net, + FIB_EVENT_ENTRY_DEL, + rt, rt->fib6_nsiblings, + NULL); + } list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { @@ -4388,13 +4419,14 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, struct fib6_config cfg = { .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL, .fc_ifindex = idev->dev->ifindex, - .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP, + .fc_flags = RTF_UP | RTF_NONEXTHOP, .fc_dst = *addr, .fc_dst_len = 128, .fc_protocol = RTPROT_KERNEL, .fc_nlinfo.nl_net = net, .fc_ignore_dev_down = true, }; + struct fib6_info *f6i; if (anycast) { cfg.fc_type = RTN_ANYCAST; @@ -4404,7 +4436,10 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, cfg.fc_flags |= RTF_LOCAL; } - return ip6_route_info_create(&cfg, gfp_flags, NULL); + f6i = ip6_route_info_create(&cfg, gfp_flags, NULL); + if (!IS_ERR(f6i)) + f6i->dst_nocount = true; + return f6i; } /* remove deleted ip from prefsrc entries */ @@ -5011,12 +5046,37 @@ static void ip6_route_mpath_notify(struct fib6_info *rt, inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); } +static bool ip6_route_mpath_should_notify(const struct fib6_info *rt) +{ + bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); + bool should_notify = false; + struct fib6_info *leaf; + struct fib6_node *fn; + + rcu_read_lock(); + fn = rcu_dereference(rt->fib6_node); + if (!fn) + goto out; + + leaf = rcu_dereference(fn->leaf); + if (!leaf) + goto out; + + if (rt == leaf || + (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric && + rt6_qualify_for_ecmp(leaf))) + should_notify = true; +out: + rcu_read_unlock(); + + return should_notify; +} + static int ip6_route_multipath_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; - enum fib_event_type event_type; struct fib6_config r_cfg; struct rtnexthop *rtnh; struct fib6_info *rt; @@ -5141,13 +5201,27 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, nhn++; } - event_type = replace ? FIB_EVENT_ENTRY_REPLACE : FIB_EVENT_ENTRY_ADD; - err = call_fib6_multipath_entry_notifiers(info->nl_net, event_type, - rt_notif, nhn - 1, extack); - if (err) { - /* Delete all the siblings that were just added */ - err_nh = NULL; - goto add_errout; + /* An in-kernel notification should only be sent in case the new + * multipath route is added as the first route in the node, or if + * it was appended to it. We pass 'rt_notif' since it is the first + * sibling and might allow us to skip some checks in the replace case. + */ + if (ip6_route_mpath_should_notify(rt_notif)) { + enum fib_event_type fib_event; + + if (rt_notif->fib6_nsiblings != nhn - 1) + fib_event = FIB_EVENT_ENTRY_APPEND; + else + fib_event = FIB_EVENT_ENTRY_REPLACE; + + err = call_fib6_multipath_entry_notifiers(info->nl_net, + fib_event, rt_notif, + nhn - 1, extack); + if (err) { + /* Delete all the siblings that were just added */ + err_nh = NULL; + goto add_errout; + } } /* success ... tell user about new route */ @@ -5325,11 +5399,11 @@ static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh, if (nexthop_is_multipath(nh)) { struct nlattr *mp; - mp = nla_nest_start(skb, RTA_MULTIPATH); + mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; - if (nexthop_mpath_fill_node(skb, nh)) + if (nexthop_mpath_fill_node(skb, nh, AF_INET6)) goto nla_put_failure; nla_nest_end(skb, mp); @@ -5337,7 +5411,7 @@ static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh, struct fib6_nh *fib6_nh; fib6_nh = nexthop_fib6_nh(nh); - if (fib_nexthop_info(skb, &fib6_nh->nh_common, + if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6, flags, false) < 0) goto nla_put_failure; } @@ -5466,13 +5540,14 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, - rt->fib6_nh->fib_nh_weight) < 0) + rt->fib6_nh->fib_nh_weight, AF_INET6) < 0) goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, - sibling->fib6_nh->fib_nh_weight) < 0) + sibling->fib6_nh->fib_nh_weight, + AF_INET6) < 0) goto nla_put_failure; } @@ -5489,7 +5564,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, rtm->rtm_flags |= nh_flags; } else { - if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, + if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6, &nh_flags, false) < 0) goto nla_put_failure; @@ -5501,6 +5576,13 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, expires -= jiffies; } + if (!dst) { + if (rt->offload) + rtm->rtm_flags |= RTM_F_OFFLOAD; + if (rt->trap) + rtm->rtm_flags |= RTM_F_TRAP; + } + if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) goto nla_put_failure; @@ -6192,6 +6274,9 @@ static int __net_init ip6_route_net_init(struct net *net) dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached); +#ifdef CONFIG_IPV6_SUBTREES + net->ipv6.fib6_routes_require_src = 0; +#endif #endif net->ipv6.sysctl.flush_delay = 0; diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 9d4f75e0d33a..7cbc19731997 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -23,6 +23,7 @@ #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/dst_cache.h> +#include <net/ip_tunnels.h> #ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> #endif @@ -81,6 +82,11 @@ static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb) if (!pskb_may_pull(skb, srhoff + len)) return NULL; + /* note that pskb_may_pull may change pointers in header; + * for this reason it is necessary to reload them when needed. + */ + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + if (!seg6_validate_srh(srh, len)) return NULL; @@ -130,7 +136,8 @@ static bool decap_and_validate(struct sk_buff *skb, int proto) skb_reset_network_header(skb); skb_reset_transport_header(skb); - skb->encapsulation = 0; + if (iptunnel_pull_offloads(skb)) + return false; return true; } @@ -144,8 +151,9 @@ static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr) *daddr = *addr; } -int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, - u32 tbl_id) +static int +seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, + u32 tbl_id, bool local_delivery) { struct net *net = dev_net(skb->dev); struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -153,6 +161,7 @@ int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, struct dst_entry *dst = NULL; struct rt6_info *rt; struct flowi6 fl6; + int dev_flags = 0; fl6.flowi6_iif = skb->dev->ifindex; fl6.daddr = nhaddr ? *nhaddr : hdr->daddr; @@ -177,7 +186,13 @@ int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, dst = &rt->dst; } - if (dst && dst->dev->flags & IFF_LOOPBACK && !dst->error) { + /* we want to discard traffic destined for local packet processing, + * if @local_delivery is set to false. + */ + if (!local_delivery) + dev_flags |= IFF_LOOPBACK; + + if (dst && (dst->dev->flags & dev_flags) && !dst->error) { dst_release(dst); dst = NULL; } @@ -194,6 +209,12 @@ out: return dst->error; } +int seg6_lookup_nexthop(struct sk_buff *skb, + struct in6_addr *nhaddr, u32 tbl_id) +{ + return seg6_lookup_any_nexthop(skb, nhaddr, tbl_id, false); +} + /* regular endpoint function */ static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt) { @@ -336,6 +357,8 @@ static int input_action_end_dx6(struct sk_buff *skb, if (!ipv6_addr_any(&slwt->nh6)) nhaddr = &slwt->nh6; + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + seg6_lookup_nexthop(skb, nhaddr, 0); return dst_input(skb); @@ -365,6 +388,8 @@ static int input_action_end_dx4(struct sk_buff *skb, skb_dst_drop(skb); + skb_set_transport_header(skb, sizeof(struct iphdr)); + err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev); if (err) goto drop; @@ -385,7 +410,9 @@ static int input_action_end_dt6(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; - seg6_lookup_nexthop(skb, NULL, slwt->table); + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + + seg6_lookup_any_nexthop(skb, NULL, slwt->table, true); return dst_input(skb); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index b2ccbc473127..98954830c40b 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -944,7 +944,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, } if (tunnel->parms.iph.daddr) - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len > mtu && !skb_is_gso(skb)) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 16632e02e9b0..13235a012388 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -178,6 +178,9 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) treq = tcp_rsk(req); treq->tfo_listener = false; + if (IS_ENABLED(CONFIG_MPTCP)) + treq->is_mptcp = 0; + if (security_inet_conn_request(sk, skb, req)) goto out_free; @@ -235,7 +238,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) fl6.flowi6_uid = sk->sk_uid; security_req_classify_flow(req, flowi6_to_flowi(&fl6)); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) goto out_free; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5da069e91cac..eaf09e6b7844 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -75,13 +75,14 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); static const struct inet_connection_sock_af_ops ipv6_mapped; -static const struct inet_connection_sock_af_ops ipv6_specific; +const struct inet_connection_sock_af_ops ipv6_specific; #ifdef CONFIG_TCP_MD5SIG static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; #else static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, - const struct in6_addr *addr) + const struct in6_addr *addr, + int l3index) { return NULL; } @@ -215,7 +216,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, !ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) { tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; - tp->write_seq = 0; + WRITE_ONCE(tp->write_seq, 0); } sk->sk_v6_daddr = usin->sin6_addr; @@ -237,6 +238,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; icsk->icsk_af_ops = &ipv6_mapped; + if (sk_is_mptcp(sk)) + mptcpv6_handle_mapped(sk, true); sk->sk_backlog_rcv = tcp_v4_do_rcv; #ifdef CONFIG_TCP_MD5SIG tp->af_specific = &tcp_sock_ipv6_mapped_specific; @@ -247,6 +250,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (err) { icsk->icsk_ext_hdr_len = exthdrlen; icsk->icsk_af_ops = &ipv6_specific; + if (sk_is_mptcp(sk)) + mptcpv6_handle_mapped(sk, false); sk->sk_backlog_rcv = tcp_v6_do_rcv; #ifdef CONFIG_TCP_MD5SIG tp->af_specific = &tcp_sock_ipv6_specific; @@ -275,7 +280,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; @@ -311,10 +316,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (likely(!tp->repair)) { if (!tp->write_seq) - tp->write_seq = secure_tcpv6_seq(np->saddr.s6_addr32, - sk->sk_v6_daddr.s6_addr32, - inet->inet_sport, - inet->inet_dport); + WRITE_ONCE(tp->write_seq, + secure_tcpv6_seq(np->saddr.s6_addr32, + sk->sk_v6_daddr.s6_addr32, + inet->inet_sport, + inet->inet_dport)); tp->tsoffset = secure_tcpv6_ts_off(sock_net(sk), np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32); @@ -406,7 +412,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ - fastopen = tp->fastopen_rsk; + fastopen = rcu_dereference(tp->fastopen_rsk); snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { @@ -512,7 +518,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass); + err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass, + sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -530,15 +537,22 @@ static void tcp_v6_reqsk_destructor(struct request_sock *req) #ifdef CONFIG_TCP_MD5SIG static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, - const struct in6_addr *addr) + const struct in6_addr *addr, + int l3index) { - return tcp_md5_do_lookup(sk, (union tcp_md5_addr *)addr, AF_INET6); + return tcp_md5_do_lookup(sk, l3index, + (union tcp_md5_addr *)addr, AF_INET6); } static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { - return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr); + int l3index; + + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), + addr_sk->sk_bound_dev_if); + return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr, + l3index); } static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, @@ -546,6 +560,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, { struct tcp_md5sig cmd; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; + int l3index = 0; u8 prefixlen; if (optlen < sizeof(cmd)) @@ -567,12 +582,30 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 32 : 128; } + if (optname == TCP_MD5SIG_EXT && + cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); + if (dev && netif_is_l3_master(dev)) + l3index = dev->ifindex; + rcu_read_unlock(); + + /* ok to reference set/not set outside of rcu; + * right now device MUST be an L3 master + */ + if (!dev || !l3index) + return -EINVAL; + } + if (!cmd.tcpm_keylen) { if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], - AF_INET, prefixlen); + AF_INET, prefixlen, + l3index); return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr, - AF_INET6, prefixlen); + AF_INET6, prefixlen, l3index); } if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) @@ -580,12 +613,13 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], - AF_INET, prefixlen, cmd.tcpm_key, - cmd.tcpm_keylen, GFP_KERNEL); + AF_INET, prefixlen, l3index, + cmd.tcpm_key, cmd.tcpm_keylen, + GFP_KERNEL); return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr, - AF_INET6, prefixlen, cmd.tcpm_key, - cmd.tcpm_keylen, GFP_KERNEL); + AF_INET6, prefixlen, l3index, + cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp, @@ -696,17 +730,23 @@ clear_hash_noput: #endif static bool tcp_v6_inbound_md5_hash(const struct sock *sk, - const struct sk_buff *skb) + const struct sk_buff *skb, + int dif, int sdif) { #ifdef CONFIG_TCP_MD5SIG const __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; const struct ipv6hdr *ip6h = ipv6_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); - int genhash; + int genhash, l3index; u8 newhash[16]; - hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr); + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to the l3mdev + */ + l3index = sdif ? dif : 0; + + hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr, l3index); hash_location = tcp_parse_md5sig_option(th); /* We've parsed the options - do we have a hash? */ @@ -730,10 +770,10 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk, if (genhash || memcmp(hash_location, newhash, 16) != 0) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); - net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n", + net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u L3 index %d\n", genhash ? "failed" : "mismatch", &ip6h->saddr, ntohs(th->source), - &ip6h->daddr, ntohs(th->dest)); + &ip6h->daddr, ntohs(th->dest), l3index); return true; } #endif @@ -783,7 +823,7 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .syn_ack_timeout = tcp_syn_ack_timeout, }; -static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { +const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr), #ifdef CONFIG_TCP_MD5SIG @@ -803,7 +843,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, int rst, - u8 tclass, __be32 label) + u8 tclass, __be32 label, u32 priority) { const struct tcphdr *th = tcp_hdr(skb); struct tcphdr *t1; @@ -904,10 +944,11 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 * Underlying function will use this to retrieve the network * namespace */ - dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); + dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(buff, dst); - ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass); + ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass, + priority); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); @@ -930,6 +971,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) struct sock *sk1 = NULL; #endif __be32 label = 0; + u32 priority = 0; struct net *net; int oif = 0; @@ -947,8 +989,18 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); if (sk && sk_fullsock(sk)) { - key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr); + int l3index; + + /* sdif set, means packet ingressed via a device + * in an L3 domain and inet_iif is set to it. + */ + l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; + key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index); } else if (hash_location) { + int dif = tcp_v6_iif_l3_slave(skb); + int sdif = tcp_v6_sdif(skb); + int l3index; + /* * active side is lost. Try to find listening socket through * source port, and then find md5 key through listening socket. @@ -960,13 +1012,16 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) &tcp_hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, - ntohs(th->source), - tcp_v6_iif_l3_slave(skb), - tcp_v6_sdif(skb)); + ntohs(th->source), dif, sdif); if (!sk1) goto out; - key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr); + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to it. + */ + l3index = tcp_v6_sdif(skb) ? dif : 0; + + key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr, l3index); if (!key) goto out; @@ -990,16 +1045,19 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) trace_tcp_send_reset(sk, skb); if (np->repflow) label = ip6_flowlabel(ipv6h); + priority = sk->sk_priority; } - if (sk->sk_state == TCP_TIME_WAIT) + if (sk->sk_state == TCP_TIME_WAIT) { label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); + priority = inet_twsk(sk)->tw_priority; + } } else { if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET) label = ip6_flowlabel(ipv6h); } tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, - label); + label, priority); #ifdef CONFIG_TCP_MD5SIG out: @@ -1010,10 +1068,10 @@ out: static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, u8 tclass, - __be32 label) + __be32 label, u32 priority) { tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0, - tclass, label); + tclass, label, priority); } static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) @@ -1025,7 +1083,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp_raw() + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), - tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel)); + tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority); inet_twsk_put(tw); } @@ -1033,6 +1091,10 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { + int l3index; + + l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ @@ -1047,8 +1109,8 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, sk->sk_bound_dev_if, - tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr), - 0, 0); + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index), + 0, 0, sk->sk_priority); } @@ -1063,6 +1125,21 @@ static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) return sk; } +u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, + struct tcphdr *th, u32 *cookie) +{ + u16 mss = 0; +#ifdef CONFIG_SYN_COOKIES + mss = tcp_get_syncookie_mss(&tcp6_request_sock_ops, + &tcp_request_sock_ipv6_ops, sk, th); + if (mss) { + *cookie = __cookie_v6_init_sequence(iph, th, &mss); + tcp_synq_overflow(sk); + } +#endif + return mss; +} + static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) @@ -1104,6 +1181,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; + int l3index; #endif struct flowi6 fl6; @@ -1129,6 +1207,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newnp->saddr = newsk->sk_v6_rcv_saddr; inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; + if (sk_is_mptcp(newsk)) + mptcpv6_handle_mapped(newsk, true); newsk->sk_backlog_rcv = tcp_v4_do_rcv; #ifdef CONFIG_TCP_MD5SIG newtp->af_specific = &tcp_sock_ipv6_mapped_specific; @@ -1247,8 +1327,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newinet->inet_rcv_saddr = LOOPBACK4_IPV6; #ifdef CONFIG_TCP_MD5SIG + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); + /* Copy over the MD5 key from the original socket */ - key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr); + key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index); if (key) { /* We're using one, so create a matching key * on the newsk structure. If we fail to get @@ -1256,7 +1338,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * * across. Shucks. */ tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr, - AF_INET6, 128, key->key, key->keylen, + AF_INET6, 128, l3index, key->key, key->keylen, sk_gfp_mask(sk, GFP_ATOMIC)); } #endif @@ -1458,6 +1540,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { struct sk_buff *skb_to_free; int sdif = inet6_sdif(skb); + int dif = inet6_iif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; bool refcounted; @@ -1506,7 +1589,7 @@ process: struct sock *nsk; sk = req->rsk_listener; - if (tcp_v6_inbound_md5_hash(sk, skb)) { + if (tcp_v6_inbound_md5_hash(sk, skb, dif, sdif)) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; @@ -1561,7 +1644,7 @@ process: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - if (tcp_v6_inbound_md5_hash(sk, skb)) + if (tcp_v6_inbound_md5_hash(sk, skb, dif, sdif)) goto discard_and_relse; if (tcp_filter(sk, skb)) @@ -1717,7 +1800,7 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_destructor = tcp_twsk_destructor, }; -static const struct inet_connection_sock_af_ops ipv6_specific = { +const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, @@ -1869,12 +1952,13 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) state = inet_sk_state_load(sp); if (state == TCP_LISTEN) - rx_queue = sp->sk_ack_backlog; + rx_queue = READ_ONCE(sp->sk_ack_backlog); else /* Because we don't lock the socket, * we might find a transient negative value. */ - rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); + rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - + READ_ONCE(tp->copied_seq), 0); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " @@ -1885,7 +1969,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, state, - tp->write_seq - tp->snd_una, + READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), @@ -2085,9 +2169,16 @@ int __init tcpv6_init(void) ret = register_pernet_subsys(&tcpv6_net_ops); if (ret) goto out_tcpv6_protosw; + + ret = mptcpv6_init(); + if (ret) + goto out_tcpv6_pernet_subsys; + out: return ret; +out_tcpv6_pernet_subsys: + unregister_pernet_subsys(&tcpv6_net_ops); out_tcpv6_protosw: inet6_unregister_protosw(&tcpv6_protosw); out_tcpv6_protocol: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 827fe7385078..5dc439a391fe 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -135,7 +135,7 @@ static int compute_score(struct sock *sk, struct net *net, return -1; score++; - if (sk->sk_incoming_cpu == raw_smp_processor_id()) + if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; return score; @@ -158,13 +158,14 @@ static struct sock *udp6_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - if (sk->sk_reuseport) { + if (sk->sk_reuseport && + sk->sk_state != TCP_ESTABLISHED) { hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (result) + if (result && !reuseport_has_conns(sk, false)) return result; } result = sk; @@ -689,8 +690,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) __skb_push(skb, -skb_mac_offset(skb)); segs = udp_rcv_segment(sk, skb, false); - for (skb = segs; skb; skb = next) { - next = skb->next; + skb_list_walk_safe(segs, skb, next) { __skb_pull(skb, skb_transport_offset(skb)); ret = udpv6_queue_rcv_one_skb(sk, skb); @@ -1108,6 +1108,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, __wsum csum = 0; int offset = skb_transport_offset(skb); int len = skb->len - offset; + int datalen = len - sizeof(*uh); /* * Create a UDP header @@ -1140,8 +1141,12 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, return -EIO; } - skb_shinfo(skb)->gso_size = cork->gso_size; - skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; + if (datalen > cork->gso_size) { + skb_shinfo(skb)->gso_size = cork->gso_size; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen, + cork->gso_size); + } goto csum_partial; } @@ -1230,6 +1235,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init(&ipc6); ipc6.gso_size = up->gso_size; ipc6.sockc.tsflags = sk->sk_tsflags; + ipc6.sockc.mark = sk->sk_mark; /* destination address check */ if (sin6) { @@ -1352,7 +1358,7 @@ do_udp_sendmsg: if (!fl6.flowi6_oif) fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; - fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_mark = ipc6.sockc.mark; fl6.flowi6_uid = sk->sk_uid; if (msg->msg_controllen) { diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 64b8f05d6735..584157a07759 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -115,8 +115,10 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); + struct sk_buff *pp; + struct sock *sk; - if (unlikely(!uh) || !static_branch_unlikely(&udpv6_encap_needed_key)) + if (unlikely(!uh)) goto flush; /* Don't bother verifying checksum if we're going to flush anyway. */ @@ -127,12 +129,16 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) ip6_gro_compute_pseudo)) goto flush; else if (uh->check) - skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + skb_gro_checksum_try_convert(skb, IPPROTO_UDP, ip6_gro_compute_pseudo); skip: NAPI_GRO_CB(skb)->is_ipv6 = 1; - return udp_gro_receive(head, skb, uh, udp6_lib_lookup_skb); + rcu_read_lock(); + sk = static_branch_unlikely(&udpv6_encap_needed_key) ? udp6_lib_lookup_skb(skb, uh->source, uh->dest) : NULL; + pp = udp_gro_receive(head, skb, uh, sk); + rcu_read_unlock(); + return pp; flush: NAPI_GRO_CB(skb)->flush = 1; @@ -144,6 +150,23 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + if (NAPI_GRO_CB(skb)->is_flist) { + uh->len = htons(skb->len - nhoff); + + skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (skb->csum_level < SKB_MAX_CSUM_LEVEL) + skb->csum_level++; + } else { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->csum_level = 0; + } + + return 0; + } + if (uh->check) uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr, &ipv6h->daddr, 0); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index eecac1b7148e..fbe51d40bd7e 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -187,7 +187,7 @@ skip_frag: int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, - net, sk, skb, NULL, skb_dst(skb)->dev, + net, sk, skb, skb->dev, skb_dst(skb)->dev, __xfrm6_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 699e0730ce8e..af7a4b8b1e9c 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -98,12 +98,13 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, } static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; - path->ops->update_pmtu(path, sk, skb, mtu); + path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh); } static void xfrm6_redirect(struct dst_entry *dst, struct sock *sk, diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index ebb62a4ebe30..c4bdcbc84b07 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -50,7 +50,7 @@ static struct iucv_interface *pr_iucv; static const u8 iprm_shutdown[8] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}; -#define TRGCLS_SIZE FIELD_SIZEOF(struct iucv_message, class) +#define TRGCLS_SIZE sizeof_field(struct iucv_message, class) #define __iucv_sock_wait(sk, condition, timeo, ret) \ do { \ diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 5dbc0c48f8cb..ea9e73428ed9 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -378,8 +378,12 @@ static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); struct bpf_prog *prog = psock->bpf_prog; + int res; - return (*prog->bpf_func)(skb, prog->insnsi); + preempt_disable(); + res = BPF_PROG_RUN(prog, skb); + preempt_enable(); + return res; } static int kcm_read_sock_done(struct strparser *strp, int err) @@ -635,15 +639,15 @@ do_frag_list: frag_offset = 0; do_frag: frag = &skb_shinfo(skb)->frags[fragidx]; - if (WARN_ON(!frag->size)) { + if (WARN_ON(!skb_frag_size(frag))) { ret = -EINVAL; goto out; } ret = kernel_sendpage(psock->sk->sk_socket, - frag->page.p, - frag->page_offset + frag_offset, - frag->size - frag_offset, + skb_frag_page(frag), + skb_frag_off(frag) + frag_offset, + skb_frag_size(frag) - frag_offset, MSG_DONTWAIT); if (ret <= 0) { if (ret == -EAGAIN) { @@ -678,7 +682,7 @@ do_frag: sent += ret; frag_offset += ret; KCM_STATS_ADD(psock->stats.tx_bytes, ret); - if (frag_offset < frag->size) { + if (frag_offset < skb_frag_size(frag)) { /* Not finished with this frag */ goto do_frag; } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 105e5a7092e7..fcb53ed1c4fb 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -122,8 +122,6 @@ static inline struct l2tp_tunnel *l2tp_tunnel(struct sock *sk) static inline struct l2tp_net *l2tp_pernet(const struct net *net) { - BUG_ON(!net); - return net_generic(net, l2tp_net_id); } @@ -322,8 +320,13 @@ int l2tp_session_register(struct l2tp_session *session, spin_lock_bh(&pn->l2tp_session_hlist_lock); + /* IP encap expects session IDs to be globally unique, while + * UDP encap doesn't. + */ hlist_for_each_entry(session_walk, g_head, global_hlist) - if (session_walk->session_id == session->session_id) { + if (session_walk->session_id == session->session_id && + (session_walk->tunnel->encap == L2TP_ENCAPTYPE_IP || + tunnel->encap == L2TP_ENCAPTYPE_IP)) { err = -EEXIST; goto err_tlock_pnlock; } @@ -1078,7 +1081,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); - nf_reset(skb); + nf_reset_ct(skb); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index bd3f39349d40..d3b520b9b2c9 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -56,7 +56,6 @@ static int l2tp_eth_dev_init(struct net_device *dev) { eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); - netdev_lockdep_set_classes(dev); return 0; } @@ -151,7 +150,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, skb->ip_summed = CHECKSUM_NONE; skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); rcu_read_lock(); dev = rcu_dereference(spriv->dev); diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 622833317dcb..0d7c887a2b75 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -193,7 +193,7 @@ pass_up: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; - nf_reset(skb); + nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 687e23a8b326..d148766f40d1 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -206,7 +206,7 @@ pass_up: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; - nf_reset(skb); + nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); @@ -615,7 +615,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 2017b7d780f5..2922d4150d88 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -113,22 +113,26 @@ static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr) * * Send data via reliable llc2 connection. * Returns 0 upon success, non-zero if action did not succeed. + * + * This function always consumes a reference to the skb. */ static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock) { struct llc_sock* llc = llc_sk(sk); - int rc = 0; if (unlikely(llc_data_accept_state(llc->state) || llc->remote_busy_flag || llc->p_flag)) { long timeout = sock_sndtimeo(sk, noblock); + int rc; rc = llc_ui_wait_for_busy_core(sk, timeout); + if (rc) { + kfree_skb(skb); + return rc; + } } - if (unlikely(!rc)) - rc = llc_build_and_send_pkt(sk, skb); - return rc; + return llc_build_and_send_pkt(sk, skb); } static void llc_ui_sk_init(struct socket *sock, struct sock *sk) @@ -701,7 +705,7 @@ static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags, /* put original socket back into a clean listen state. */ sk->sk_state = TCP_LISTEN; - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); dprintk("%s: ok success on %02X, client on %02X\n", __func__, llc_sk(sk)->addr.sllc_sap, newllc->daddr.lsap); frees: @@ -776,7 +780,7 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, } /* Well, if we have backlog, try to process it now yet. */ - if (copied >= target && !sk->sk_backlog.tail) + if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { @@ -899,7 +903,7 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); int flags = msg->msg_flags; int noblock = flags & MSG_DONTWAIT; - struct sk_buff *skb; + struct sk_buff *skb = NULL; size_t size = 0; int rc = -EINVAL, copied = 0, hdrlen; @@ -908,10 +912,10 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) lock_sock(sk); if (addr) { if (msg->msg_namelen < sizeof(*addr)) - goto release; + goto out; } else { if (llc_ui_addr_null(&llc->addr)) - goto release; + goto out; addr = &llc->addr; } /* must bind connection to sap if user hasn't done it. */ @@ -919,7 +923,7 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) /* bind to sap with null dev, exclusive. */ rc = llc_ui_autobind(sock, addr); if (rc) - goto release; + goto out; } hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr); size = hdrlen + len; @@ -928,12 +932,12 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) copied = size - hdrlen; rc = -EINVAL; if (copied < 0) - goto release; + goto out; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); if (!skb) - goto release; + goto out; skb->dev = llc->dev; skb->protocol = llc_proto_type(addr->sllc_arphrd); skb_reserve(skb, hdrlen); @@ -943,29 +947,31 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) { llc_build_and_send_ui_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); + skb = NULL; goto out; } if (addr->sllc_test) { llc_build_and_send_test_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); + skb = NULL; goto out; } if (addr->sllc_xid) { llc_build_and_send_xid_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); + skb = NULL; goto out; } rc = -ENOPROTOOPT; if (!(sk->sk_type == SOCK_STREAM && !addr->sllc_ua)) goto out; rc = llc_ui_send_data(sk, skb, noblock); + skb = NULL; out: - if (rc) { - kfree_skb(skb); -release: + kfree_skb(skb); + if (rc) dprintk("%s: failed sending from %02X to %02X: %d\n", __func__, llc->laddr.lsap, llc->daddr.lsap, rc); - } release_sock(sk); return rc ? : copied; } diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index 4d78375f9872..647c0554d04c 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -372,6 +372,7 @@ int llc_conn_ac_send_i_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) llc_pdu_init_as_i_cmd(skb, 1, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { + skb_get(skb); llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } @@ -389,7 +390,8 @@ static int llc_conn_ac_send_i_cmd_p_set_0(struct sock *sk, struct sk_buff *skb) llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { - rc = llc_conn_send_pdu(sk, skb); + skb_get(skb); + llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return rc; @@ -406,6 +408,7 @@ int llc_conn_ac_send_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { + skb_get(skb); llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } @@ -916,7 +919,8 @@ static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk, llc_pdu_init_as_i_cmd(skb, llc->ack_pf, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { - rc = llc_conn_send_pdu(sk, skb); + skb_get(skb); + llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return rc; diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 4ff89cb7c86f..7b620acaca9e 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -30,7 +30,7 @@ #endif static int llc_find_offset(int state, int ev_type); -static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *skb); +static void llc_conn_send_pdus(struct sock *sk); static int llc_conn_service(struct sock *sk, struct sk_buff *skb); static int llc_exec_conn_trans_actions(struct sock *sk, struct llc_conn_state_trans *trans, @@ -55,6 +55,8 @@ int sysctl_llc2_busy_timeout = LLC2_BUSY_TIME * HZ; * (executing it's actions and changing state), upper layer will be * indicated or confirmed, if needed. Returns 0 for success, 1 for * failure. The socket lock has to be held before calling this function. + * + * This function always consumes a reference to the skb. */ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) { @@ -62,12 +64,6 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) struct llc_sock *llc = llc_sk(skb->sk); struct llc_conn_state_ev *ev = llc_conn_ev(skb); - /* - * We have to hold the skb, because llc_conn_service will kfree it in - * the sending path and we need to look at the skb->cb, where we encode - * llc_conn_state_ev. - */ - skb_get(skb); ev->ind_prim = ev->cfm_prim = 0; /* * Send event to state machine @@ -75,21 +71,12 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) rc = llc_conn_service(skb->sk, skb); if (unlikely(rc != 0)) { printk(KERN_ERR "%s: llc_conn_service failed\n", __func__); - goto out_kfree_skb; - } - - if (unlikely(!ev->ind_prim && !ev->cfm_prim)) { - /* indicate or confirm not required */ - if (!skb->next) - goto out_kfree_skb; goto out_skb_put; } - if (unlikely(ev->ind_prim && ev->cfm_prim)) /* Paranoia */ - skb_get(skb); - switch (ev->ind_prim) { case LLC_DATA_PRIM: + skb_get(skb); llc_save_primitive(sk, skb, LLC_DATA_PRIM); if (unlikely(sock_queue_rcv_skb(sk, skb))) { /* @@ -106,6 +93,7 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) * skb->sk pointing to the newly created struct sock in * llc_conn_handler. -acme */ + skb_get(skb); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_state_change(sk); break; @@ -121,7 +109,6 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) sk->sk_state_change(sk); } } - kfree_skb(skb); sock_put(sk); break; case LLC_RESET_PRIM: @@ -130,14 +117,11 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) * RESET is not being notified to upper layers for now */ printk(KERN_INFO "%s: received a reset ind!\n", __func__); - kfree_skb(skb); break; default: - if (ev->ind_prim) { + if (ev->ind_prim) printk(KERN_INFO "%s: received unknown %d prim!\n", __func__, ev->ind_prim); - kfree_skb(skb); - } /* No indication */ break; } @@ -179,25 +163,22 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) printk(KERN_INFO "%s: received a reset conf!\n", __func__); break; default: - if (ev->cfm_prim) { + if (ev->cfm_prim) printk(KERN_INFO "%s: received unknown %d prim!\n", __func__, ev->cfm_prim); - break; - } - goto out_skb_put; /* No confirmation */ + /* No confirmation */ + break; } -out_kfree_skb: - kfree_skb(skb); out_skb_put: kfree_skb(skb); return rc; } -int llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb) +void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb) { /* queue PDU to send to MAC layer */ skb_queue_tail(&sk->sk_write_queue, skb); - return llc_conn_send_pdus(sk, skb); + llc_conn_send_pdus(sk); } /** @@ -255,7 +236,7 @@ void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit) if (howmany_resend > 0) llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; /* any PDUs to re-send are queued up; start sending to MAC */ - llc_conn_send_pdus(sk, NULL); + llc_conn_send_pdus(sk); out:; } @@ -296,7 +277,7 @@ void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit) if (howmany_resend > 0) llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; /* any PDUs to re-send are queued up; start sending to MAC */ - llc_conn_send_pdus(sk, NULL); + llc_conn_send_pdus(sk); out:; } @@ -340,16 +321,12 @@ out: /** * llc_conn_send_pdus - Sends queued PDUs * @sk: active connection - * @hold_skb: the skb held by caller, or NULL if does not care * - * Sends queued pdus to MAC layer for transmission. When @hold_skb is - * NULL, always return 0. Otherwise, return 0 if @hold_skb is sent - * successfully, or 1 for failure. + * Sends queued pdus to MAC layer for transmission. */ -static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *hold_skb) +static void llc_conn_send_pdus(struct sock *sk) { struct sk_buff *skb; - int ret = 0; while ((skb = skb_dequeue(&sk->sk_write_queue)) != NULL) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); @@ -361,20 +338,10 @@ static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *hold_skb) skb_queue_tail(&llc_sk(sk)->pdu_unack_q, skb); if (!skb2) break; - dev_queue_xmit(skb2); - } else { - bool is_target = skb == hold_skb; - int rc; - - if (is_target) - skb_get(skb); - rc = dev_queue_xmit(skb); - if (is_target) - ret = rc; + skb = skb2; } + dev_queue_xmit(skb); } - - return ret; } /** @@ -846,7 +813,7 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb) else { dprintk("%s: adding to backlog...\n", __func__); llc_set_backlog_type(skb, LLC_PACKET); - if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) + if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) goto drop_unlock; } out: diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c index 8db03c2d5440..ad6547736c21 100644 --- a/net/llc/llc_if.c +++ b/net/llc/llc_if.c @@ -38,6 +38,8 @@ * closed and -EBUSY when sending data is not permitted in this state or * LLC has send an I pdu with p bit set to 1 and is waiting for it's * response. + * + * This function always consumes a reference to the skb. */ int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb) { @@ -46,20 +48,22 @@ int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb) struct llc_sock *llc = llc_sk(sk); if (unlikely(llc->state == LLC_CONN_STATE_ADM)) - goto out; + goto out_free; rc = -EBUSY; if (unlikely(llc_data_accept_state(llc->state) || /* data_conn_refuse */ llc->p_flag)) { llc->failed_data_req = 1; - goto out; + goto out_free; } ev = llc_conn_ev(skb); ev->type = LLC_CONN_EV_TYPE_PRIM; ev->prim = LLC_DATA_PRIM; ev->prim_type = LLC_PRIM_TYPE_REQ; skb->dev = llc->dev; - rc = llc_conn_state_process(sk, skb); -out: + return llc_conn_state_process(sk, skb); + +out_free: + kfree_skb(skb); return rc; } diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c index a94bd56bcac6..7ae4cc684d3a 100644 --- a/net/llc/llc_s_ac.c +++ b/net/llc/llc_s_ac.c @@ -58,8 +58,10 @@ int llc_sap_action_send_ui(struct llc_sap *sap, struct sk_buff *skb) ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_ui_cmd(skb); rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); - if (likely(!rc)) + if (likely(!rc)) { + skb_get(skb); rc = dev_queue_xmit(skb); + } return rc; } @@ -81,8 +83,10 @@ int llc_sap_action_send_xid_c(struct llc_sap *sap, struct sk_buff *skb) ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_xid_cmd(skb, LLC_XID_NULL_CLASS_2, 0); rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); - if (likely(!rc)) + if (likely(!rc)) { + skb_get(skb); rc = dev_queue_xmit(skb); + } return rc; } @@ -135,8 +139,10 @@ int llc_sap_action_send_test_c(struct llc_sap *sap, struct sk_buff *skb) ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_test_cmd(skb); rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); - if (likely(!rc)) + if (likely(!rc)) { + skb_get(skb); rc = dev_queue_xmit(skb); + } return rc; } diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index a7f7b8ff4729..be419062e19a 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -197,29 +197,22 @@ out: * After executing actions of the event, upper layer will be indicated * if needed(on receiving an UI frame). sk can be null for the * datalink_proto case. + * + * This function always consumes a reference to the skb. */ static void llc_sap_state_process(struct llc_sap *sap, struct sk_buff *skb) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); - /* - * We have to hold the skb, because llc_sap_next_state - * will kfree it in the sending path and we need to - * look at the skb->cb, where we encode llc_sap_state_ev. - */ - skb_get(skb); ev->ind_cfm_flag = 0; llc_sap_next_state(sap, skb); - if (ev->ind_cfm_flag == LLC_IND) { - if (skb->sk->sk_state == TCP_LISTEN) - kfree_skb(skb); - else { - llc_save_primitive(skb->sk, skb, ev->prim); - /* queue skb to the user. */ - if (sock_queue_rcv_skb(skb->sk, skb)) - kfree_skb(skb); - } + if (ev->ind_cfm_flag == LLC_IND && skb->sk->sk_state != TCP_LISTEN) { + llc_save_primitive(skb->sk, skb, ev->prim); + + /* queue skb to the user. */ + if (sock_queue_rcv_skb(skb->sk, skb) == 0) + return; } kfree_skb(skb); } diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c index 204a8351efff..c29170e767a8 100644 --- a/net/llc/llc_station.c +++ b/net/llc/llc_station.c @@ -32,7 +32,7 @@ static int llc_stat_ev_rx_null_dsap_xid_c(struct sk_buff *skb) return LLC_PDU_IS_CMD(pdu) && /* command PDU */ LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */ LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_XID && - !pdu->dsap ? 0 : 1; /* NULL DSAP value */ + !pdu->dsap; /* NULL DSAP value */ } static int llc_stat_ev_rx_null_dsap_test_c(struct sk_buff *skb) @@ -42,7 +42,7 @@ static int llc_stat_ev_rx_null_dsap_test_c(struct sk_buff *skb) return LLC_PDU_IS_CMD(pdu) && /* command PDU */ LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */ LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_TEST && - !pdu->dsap ? 0 : 1; /* NULL DSAP */ + !pdu->dsap; /* NULL DSAP */ } static int llc_station_ac_send_xid_r(struct sk_buff *skb) diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index 4f03ebe732fa..6cbb1286d6c0 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -32,7 +32,8 @@ mac80211-y := \ chan.o \ trace.o mlme.o \ tdls.o \ - ocb.o + ocb.o \ + airtime.o mac80211-$(CONFIG_MAC80211_LEDS) += led.o mac80211-$(CONFIG_MAC80211_DEBUGFS) += \ diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 01b0dad24500..4d1c335e06e5 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -178,17 +178,54 @@ static void sta_rx_agg_reorder_timer_expired(struct timer_list *t) rcu_read_unlock(); } -static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid, +static void ieee80211_add_addbaext(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, + const struct ieee80211_addba_ext_ie *req) +{ + struct ieee80211_supported_band *sband; + struct ieee80211_addba_ext_ie *resp; + const struct ieee80211_sta_he_cap *he_cap; + u8 frag_level, cap_frag_level; + u8 *pos; + + sband = ieee80211_get_sband(sdata); + if (!sband) + return; + he_cap = ieee80211_get_he_iftype_cap(sband, sdata->vif.type); + if (!he_cap) + return; + + pos = skb_put_zero(skb, 2 + sizeof(struct ieee80211_addba_ext_ie)); + *pos++ = WLAN_EID_ADDBA_EXT; + *pos++ = sizeof(struct ieee80211_addba_ext_ie); + resp = (struct ieee80211_addba_ext_ie *)pos; + resp->data = req->data & IEEE80211_ADDBA_EXT_NO_FRAG; + + frag_level = u32_get_bits(req->data, + IEEE80211_ADDBA_EXT_FRAG_LEVEL_MASK); + cap_frag_level = u32_get_bits(he_cap->he_cap_elem.mac_cap_info[0], + IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK); + if (frag_level > cap_frag_level) + frag_level = cap_frag_level; + resp->data |= u8_encode_bits(frag_level, + IEEE80211_ADDBA_EXT_FRAG_LEVEL_MASK); +} + +static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid, u8 dialog_token, u16 status, u16 policy, - u16 buf_size, u16 timeout) + u16 buf_size, u16 timeout, + const struct ieee80211_addba_ext_ie *addbaext) { + struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; bool amsdu = ieee80211_hw_check(&local->hw, SUPPORTS_AMSDU_IN_AMPDU); u16 capab; - skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom); + skb = dev_alloc_skb(sizeof(*mgmt) + + 2 + sizeof(struct ieee80211_addba_ext_ie) + + local->hw.extra_tx_headroom); if (!skb) return; @@ -222,13 +259,17 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout); mgmt->u.action.u.addba_resp.status = cpu_to_le16(status); + if (sta->sta.he_cap.has_he && addbaext) + ieee80211_add_addbaext(sdata, skb, addbaext); + ieee80211_tx_skb(sdata, skb); } void ___ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, - u16 buf_size, bool tx, bool auto_seq) + u16 buf_size, bool tx, bool auto_seq, + const struct ieee80211_addba_ext_ie *addbaext) { struct ieee80211_local *local = sta->sdata->local; struct tid_ampdu_rx *tid_agg_rx; @@ -410,21 +451,22 @@ end: } if (tx) - ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid, + ieee80211_send_addba_resp(sta, sta->sta.addr, tid, dialog_token, status, 1, buf_size, - timeout); + timeout, addbaext); } static void __ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, u16 buf_size, bool tx, - bool auto_seq) + bool auto_seq, + const struct ieee80211_addba_ext_ie *addbaext) { mutex_lock(&sta->ampdu_mlme.mtx); ___ieee80211_start_rx_ba_session(sta, dialog_token, timeout, start_seq_num, ba_policy, tid, - buf_size, tx, auto_seq); + buf_size, tx, auto_seq, addbaext); mutex_unlock(&sta->ampdu_mlme.mtx); } @@ -434,7 +476,9 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, size_t len) { u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num; + struct ieee802_11_elems elems = { 0 }; u8 dialog_token; + int ies_len; /* extract session parameters from addba request frame */ dialog_token = mgmt->u.action.u.addba_req.dialog_token; @@ -447,9 +491,19 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; + ies_len = len - offsetof(struct ieee80211_mgmt, + u.action.u.addba_req.variable); + if (ies_len) { + ieee802_11_parse_elems(mgmt->u.action.u.addba_req.variable, + ies_len, true, &elems, mgmt->bssid, NULL); + if (elems.parse_error) + return; + } + __ieee80211_start_rx_ba_session(sta, dialog_token, timeout, start_seq_num, ba_policy, tid, - buf_size, true, false); + buf_size, true, false, + elems.addba_ext_ie); } void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index b11883d26875..33da6f738c99 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -485,7 +485,14 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) params.ssn = sta->tid_seq[tid] >> 4; ret = drv_ampdu_action(local, sdata, ¶ms); - if (ret) { + if (ret == IEEE80211_AMPDU_TX_START_IMMEDIATE) { + /* + * We didn't send the request yet, so don't need to check + * here if we already got a response, just mark as driver + * ready immediately. + */ + set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state); + } else if (ret) { ht_dbg(sdata, "BA request denied - HW unavailable for %pM tid %d\n", sta->sta.addr, tid); diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c new file mode 100644 index 000000000000..9fc2968856c0 --- /dev/null +++ b/net/mac80211/airtime.c @@ -0,0 +1,597 @@ +// SPDX-License-Identifier: ISC +/* + * Copyright (C) 2019 Felix Fietkau <nbd@nbd.name> + */ + +#include <net/mac80211.h> +#include "ieee80211_i.h" +#include "sta_info.h" + +#define AVG_PKT_SIZE 1024 + +/* Number of bits for an average sized packet */ +#define MCS_NBITS (AVG_PKT_SIZE << 3) + +/* Number of kilo-symbols (symbols * 1024) for a packet with (bps) bits per + * symbol. We use k-symbols to avoid rounding in the _TIME macros below. + */ +#define MCS_N_KSYMS(bps) DIV_ROUND_UP(MCS_NBITS << 10, (bps)) + +/* Transmission time (in 1024 * usec) for a packet containing (ksyms) * 1024 + * symbols. + */ +#define MCS_SYMBOL_TIME(sgi, ksyms) \ + (sgi ? \ + ((ksyms) * 4 * 18) / 20 : /* 3.6 us per sym */ \ + ((ksyms) * 4) /* 4.0 us per sym */ \ + ) + +/* Transmit duration for the raw data part of an average sized packet */ +#define MCS_DURATION(streams, sgi, bps) \ + ((u32)MCS_SYMBOL_TIME(sgi, MCS_N_KSYMS((streams) * (bps)))) + +#define MCS_DURATION_S(shift, streams, sgi, bps) \ + ((u16)((MCS_DURATION(streams, sgi, bps) >> shift))) + +/* These should match the values in enum nl80211_he_gi */ +#define HE_GI_08 0 +#define HE_GI_16 1 +#define HE_GI_32 2 + +/* Transmission time (1024 usec) for a packet containing (ksyms) * k-symbols */ +#define HE_SYMBOL_TIME(gi, ksyms) \ + (gi == HE_GI_08 ? \ + ((ksyms) * 16 * 17) / 20 : /* 13.6 us per sym */ \ + (gi == HE_GI_16 ? \ + ((ksyms) * 16 * 18) / 20 : /* 14.4 us per sym */ \ + ((ksyms) * 16) /* 16.0 us per sym */ \ + )) + +/* Transmit duration for the raw data part of an average sized packet */ +#define HE_DURATION(streams, gi, bps) \ + ((u32)HE_SYMBOL_TIME(gi, MCS_N_KSYMS((streams) * (bps)))) + +#define HE_DURATION_S(shift, streams, gi, bps) \ + (HE_DURATION(streams, gi, bps) >> shift) + +#define BW_20 0 +#define BW_40 1 +#define BW_80 2 +#define BW_160 3 + +/* + * Define group sort order: HT40 -> SGI -> #streams + */ +#define IEEE80211_MAX_STREAMS 4 +#define IEEE80211_HT_STREAM_GROUPS 4 /* BW(=2) * SGI(=2) */ +#define IEEE80211_VHT_STREAM_GROUPS 8 /* BW(=4) * SGI(=2) */ + +#define IEEE80211_HE_MAX_STREAMS 8 +#define IEEE80211_HE_STREAM_GROUPS 12 /* BW(=4) * GI(=3) */ + +#define IEEE80211_HT_GROUPS_NB (IEEE80211_MAX_STREAMS * \ + IEEE80211_HT_STREAM_GROUPS) +#define IEEE80211_VHT_GROUPS_NB (IEEE80211_MAX_STREAMS * \ + IEEE80211_VHT_STREAM_GROUPS) +#define IEEE80211_HE_GROUPS_NB (IEEE80211_HE_MAX_STREAMS * \ + IEEE80211_HE_STREAM_GROUPS) +#define IEEE80211_GROUPS_NB (IEEE80211_HT_GROUPS_NB + \ + IEEE80211_VHT_GROUPS_NB + \ + IEEE80211_HE_GROUPS_NB) + +#define IEEE80211_HT_GROUP_0 0 +#define IEEE80211_VHT_GROUP_0 (IEEE80211_HT_GROUP_0 + IEEE80211_HT_GROUPS_NB) +#define IEEE80211_HE_GROUP_0 (IEEE80211_VHT_GROUP_0 + IEEE80211_VHT_GROUPS_NB) + +#define MCS_GROUP_RATES 12 + +#define HT_GROUP_IDX(_streams, _sgi, _ht40) \ + IEEE80211_HT_GROUP_0 + \ + IEEE80211_MAX_STREAMS * 2 * _ht40 + \ + IEEE80211_MAX_STREAMS * _sgi + \ + _streams - 1 + +#define _MAX(a, b) (((a)>(b))?(a):(b)) + +#define GROUP_SHIFT(duration) \ + _MAX(0, 16 - __builtin_clz(duration)) + +/* MCS rate information for an MCS group */ +#define __MCS_GROUP(_streams, _sgi, _ht40, _s) \ + [HT_GROUP_IDX(_streams, _sgi, _ht40)] = { \ + .shift = _s, \ + .duration = { \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 54 : 26), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 108 : 52), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 162 : 78), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 216 : 104), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 324 : 156), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 432 : 208), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 486 : 234), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 540 : 260) \ + } \ +} + +#define MCS_GROUP_SHIFT(_streams, _sgi, _ht40) \ + GROUP_SHIFT(MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26)) + +#define MCS_GROUP(_streams, _sgi, _ht40) \ + __MCS_GROUP(_streams, _sgi, _ht40, \ + MCS_GROUP_SHIFT(_streams, _sgi, _ht40)) + +#define VHT_GROUP_IDX(_streams, _sgi, _bw) \ + (IEEE80211_VHT_GROUP_0 + \ + IEEE80211_MAX_STREAMS * 2 * (_bw) + \ + IEEE80211_MAX_STREAMS * (_sgi) + \ + (_streams) - 1) + +#define BW2VBPS(_bw, r4, r3, r2, r1) \ + (_bw == BW_160 ? r4 : _bw == BW_80 ? r3 : _bw == BW_40 ? r2 : r1) + +#define __VHT_GROUP(_streams, _sgi, _bw, _s) \ + [VHT_GROUP_IDX(_streams, _sgi, _bw)] = { \ + .shift = _s, \ + .duration = { \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 234, 117, 54, 26)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 468, 234, 108, 52)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 702, 351, 162, 78)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 936, 468, 216, 104)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 1404, 702, 324, 156)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 1872, 936, 432, 208)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 2106, 1053, 486, 234)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 2340, 1170, 540, 260)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 2808, 1404, 648, 312)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 3120, 1560, 720, 346)) \ + } \ +} + +#define VHT_GROUP_SHIFT(_streams, _sgi, _bw) \ + GROUP_SHIFT(MCS_DURATION(_streams, _sgi, \ + BW2VBPS(_bw, 243, 117, 54, 26))) + +#define VHT_GROUP(_streams, _sgi, _bw) \ + __VHT_GROUP(_streams, _sgi, _bw, \ + VHT_GROUP_SHIFT(_streams, _sgi, _bw)) + + +#define HE_GROUP_IDX(_streams, _gi, _bw) \ + (IEEE80211_HE_GROUP_0 + \ + IEEE80211_HE_MAX_STREAMS * 3 * (_bw) + \ + IEEE80211_HE_MAX_STREAMS * (_gi) + \ + (_streams) - 1) + +#define __HE_GROUP(_streams, _gi, _bw, _s) \ + [HE_GROUP_IDX(_streams, _gi, _bw)] = { \ + .shift = _s, \ + .duration = { \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 979, 489, 230, 115)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 1958, 979, 475, 230)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 2937, 1468, 705, 345)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 3916, 1958, 936, 475)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 5875, 2937, 1411, 705)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 7833, 3916, 1872, 936)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 8827, 4406, 2102, 1051)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 9806, 4896, 2347, 1166)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 11764, 5875, 2808, 1411)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 13060, 6523, 3124, 1555)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 14702, 7344, 3513, 1756)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 16329, 8164, 3902, 1944)) \ + } \ +} + +#define HE_GROUP_SHIFT(_streams, _gi, _bw) \ + GROUP_SHIFT(HE_DURATION(_streams, _gi, \ + BW2VBPS(_bw, 979, 489, 230, 115))) + +#define HE_GROUP(_streams, _gi, _bw) \ + __HE_GROUP(_streams, _gi, _bw, \ + HE_GROUP_SHIFT(_streams, _gi, _bw)) +struct mcs_group { + u8 shift; + u16 duration[MCS_GROUP_RATES]; +}; + +static const struct mcs_group airtime_mcs_groups[] = { + MCS_GROUP(1, 0, BW_20), + MCS_GROUP(2, 0, BW_20), + MCS_GROUP(3, 0, BW_20), + MCS_GROUP(4, 0, BW_20), + + MCS_GROUP(1, 1, BW_20), + MCS_GROUP(2, 1, BW_20), + MCS_GROUP(3, 1, BW_20), + MCS_GROUP(4, 1, BW_20), + + MCS_GROUP(1, 0, BW_40), + MCS_GROUP(2, 0, BW_40), + MCS_GROUP(3, 0, BW_40), + MCS_GROUP(4, 0, BW_40), + + MCS_GROUP(1, 1, BW_40), + MCS_GROUP(2, 1, BW_40), + MCS_GROUP(3, 1, BW_40), + MCS_GROUP(4, 1, BW_40), + + VHT_GROUP(1, 0, BW_20), + VHT_GROUP(2, 0, BW_20), + VHT_GROUP(3, 0, BW_20), + VHT_GROUP(4, 0, BW_20), + + VHT_GROUP(1, 1, BW_20), + VHT_GROUP(2, 1, BW_20), + VHT_GROUP(3, 1, BW_20), + VHT_GROUP(4, 1, BW_20), + + VHT_GROUP(1, 0, BW_40), + VHT_GROUP(2, 0, BW_40), + VHT_GROUP(3, 0, BW_40), + VHT_GROUP(4, 0, BW_40), + + VHT_GROUP(1, 1, BW_40), + VHT_GROUP(2, 1, BW_40), + VHT_GROUP(3, 1, BW_40), + VHT_GROUP(4, 1, BW_40), + + VHT_GROUP(1, 0, BW_80), + VHT_GROUP(2, 0, BW_80), + VHT_GROUP(3, 0, BW_80), + VHT_GROUP(4, 0, BW_80), + + VHT_GROUP(1, 1, BW_80), + VHT_GROUP(2, 1, BW_80), + VHT_GROUP(3, 1, BW_80), + VHT_GROUP(4, 1, BW_80), + + VHT_GROUP(1, 0, BW_160), + VHT_GROUP(2, 0, BW_160), + VHT_GROUP(3, 0, BW_160), + VHT_GROUP(4, 0, BW_160), + + VHT_GROUP(1, 1, BW_160), + VHT_GROUP(2, 1, BW_160), + VHT_GROUP(3, 1, BW_160), + VHT_GROUP(4, 1, BW_160), + + HE_GROUP(1, HE_GI_08, BW_20), + HE_GROUP(2, HE_GI_08, BW_20), + HE_GROUP(3, HE_GI_08, BW_20), + HE_GROUP(4, HE_GI_08, BW_20), + HE_GROUP(5, HE_GI_08, BW_20), + HE_GROUP(6, HE_GI_08, BW_20), + HE_GROUP(7, HE_GI_08, BW_20), + HE_GROUP(8, HE_GI_08, BW_20), + + HE_GROUP(1, HE_GI_16, BW_20), + HE_GROUP(2, HE_GI_16, BW_20), + HE_GROUP(3, HE_GI_16, BW_20), + HE_GROUP(4, HE_GI_16, BW_20), + HE_GROUP(5, HE_GI_16, BW_20), + HE_GROUP(6, HE_GI_16, BW_20), + HE_GROUP(7, HE_GI_16, BW_20), + HE_GROUP(8, HE_GI_16, BW_20), + + HE_GROUP(1, HE_GI_32, BW_20), + HE_GROUP(2, HE_GI_32, BW_20), + HE_GROUP(3, HE_GI_32, BW_20), + HE_GROUP(4, HE_GI_32, BW_20), + HE_GROUP(5, HE_GI_32, BW_20), + HE_GROUP(6, HE_GI_32, BW_20), + HE_GROUP(7, HE_GI_32, BW_20), + HE_GROUP(8, HE_GI_32, BW_20), + + HE_GROUP(1, HE_GI_08, BW_40), + HE_GROUP(2, HE_GI_08, BW_40), + HE_GROUP(3, HE_GI_08, BW_40), + HE_GROUP(4, HE_GI_08, BW_40), + HE_GROUP(5, HE_GI_08, BW_40), + HE_GROUP(6, HE_GI_08, BW_40), + HE_GROUP(7, HE_GI_08, BW_40), + HE_GROUP(8, HE_GI_08, BW_40), + + HE_GROUP(1, HE_GI_16, BW_40), + HE_GROUP(2, HE_GI_16, BW_40), + HE_GROUP(3, HE_GI_16, BW_40), + HE_GROUP(4, HE_GI_16, BW_40), + HE_GROUP(5, HE_GI_16, BW_40), + HE_GROUP(6, HE_GI_16, BW_40), + HE_GROUP(7, HE_GI_16, BW_40), + HE_GROUP(8, HE_GI_16, BW_40), + + HE_GROUP(1, HE_GI_32, BW_40), + HE_GROUP(2, HE_GI_32, BW_40), + HE_GROUP(3, HE_GI_32, BW_40), + HE_GROUP(4, HE_GI_32, BW_40), + HE_GROUP(5, HE_GI_32, BW_40), + HE_GROUP(6, HE_GI_32, BW_40), + HE_GROUP(7, HE_GI_32, BW_40), + HE_GROUP(8, HE_GI_32, BW_40), + + HE_GROUP(1, HE_GI_08, BW_80), + HE_GROUP(2, HE_GI_08, BW_80), + HE_GROUP(3, HE_GI_08, BW_80), + HE_GROUP(4, HE_GI_08, BW_80), + HE_GROUP(5, HE_GI_08, BW_80), + HE_GROUP(6, HE_GI_08, BW_80), + HE_GROUP(7, HE_GI_08, BW_80), + HE_GROUP(8, HE_GI_08, BW_80), + + HE_GROUP(1, HE_GI_16, BW_80), + HE_GROUP(2, HE_GI_16, BW_80), + HE_GROUP(3, HE_GI_16, BW_80), + HE_GROUP(4, HE_GI_16, BW_80), + HE_GROUP(5, HE_GI_16, BW_80), + HE_GROUP(6, HE_GI_16, BW_80), + HE_GROUP(7, HE_GI_16, BW_80), + HE_GROUP(8, HE_GI_16, BW_80), + + HE_GROUP(1, HE_GI_32, BW_80), + HE_GROUP(2, HE_GI_32, BW_80), + HE_GROUP(3, HE_GI_32, BW_80), + HE_GROUP(4, HE_GI_32, BW_80), + HE_GROUP(5, HE_GI_32, BW_80), + HE_GROUP(6, HE_GI_32, BW_80), + HE_GROUP(7, HE_GI_32, BW_80), + HE_GROUP(8, HE_GI_32, BW_80), + + HE_GROUP(1, HE_GI_08, BW_160), + HE_GROUP(2, HE_GI_08, BW_160), + HE_GROUP(3, HE_GI_08, BW_160), + HE_GROUP(4, HE_GI_08, BW_160), + HE_GROUP(5, HE_GI_08, BW_160), + HE_GROUP(6, HE_GI_08, BW_160), + HE_GROUP(7, HE_GI_08, BW_160), + HE_GROUP(8, HE_GI_08, BW_160), + + HE_GROUP(1, HE_GI_16, BW_160), + HE_GROUP(2, HE_GI_16, BW_160), + HE_GROUP(3, HE_GI_16, BW_160), + HE_GROUP(4, HE_GI_16, BW_160), + HE_GROUP(5, HE_GI_16, BW_160), + HE_GROUP(6, HE_GI_16, BW_160), + HE_GROUP(7, HE_GI_16, BW_160), + HE_GROUP(8, HE_GI_16, BW_160), + + HE_GROUP(1, HE_GI_32, BW_160), + HE_GROUP(2, HE_GI_32, BW_160), + HE_GROUP(3, HE_GI_32, BW_160), + HE_GROUP(4, HE_GI_32, BW_160), + HE_GROUP(5, HE_GI_32, BW_160), + HE_GROUP(6, HE_GI_32, BW_160), + HE_GROUP(7, HE_GI_32, BW_160), + HE_GROUP(8, HE_GI_32, BW_160), +}; + +static u32 +ieee80211_calc_legacy_rate_duration(u16 bitrate, bool short_pre, + bool cck, int len) +{ + u32 duration; + + if (cck) { + duration = 144 + 48; /* preamble + PLCP */ + if (short_pre) + duration >>= 1; + + duration += 10; /* SIFS */ + } else { + duration = 20 + 16; /* premable + SIFS */ + } + + len <<= 3; + duration += (len * 10) / bitrate; + + return duration; +} + +u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, + struct ieee80211_rx_status *status, + int len) +{ + struct ieee80211_supported_band *sband; + const struct ieee80211_rate *rate; + bool sgi = status->enc_flags & RX_ENC_FLAG_SHORT_GI; + bool sp = status->enc_flags & RX_ENC_FLAG_SHORTPRE; + int bw, streams; + int group, idx; + u32 duration; + bool cck; + + switch (status->bw) { + case RATE_INFO_BW_20: + bw = BW_20; + break; + case RATE_INFO_BW_40: + bw = BW_40; + break; + case RATE_INFO_BW_80: + bw = BW_80; + break; + case RATE_INFO_BW_160: + bw = BW_160; + break; + default: + WARN_ON_ONCE(1); + return 0; + } + + switch (status->encoding) { + case RX_ENC_LEGACY: + if (WARN_ON_ONCE(status->band > NL80211_BAND_5GHZ)) + return 0; + + sband = hw->wiphy->bands[status->band]; + if (!sband || status->rate_idx >= sband->n_bitrates) + return 0; + + rate = &sband->bitrates[status->rate_idx]; + cck = rate->flags & IEEE80211_RATE_MANDATORY_B; + + return ieee80211_calc_legacy_rate_duration(rate->bitrate, sp, + cck, len); + + case RX_ENC_VHT: + streams = status->nss; + idx = status->rate_idx; + group = VHT_GROUP_IDX(streams, sgi, bw); + break; + case RX_ENC_HT: + streams = ((status->rate_idx >> 3) & 3) + 1; + idx = status->rate_idx & 7; + group = HT_GROUP_IDX(streams, sgi, bw); + break; + case RX_ENC_HE: + streams = status->nss; + idx = status->rate_idx; + group = HE_GROUP_IDX(streams, status->he_gi, bw); + break; + default: + WARN_ON_ONCE(1); + return 0; + } + + if (WARN_ON_ONCE((status->encoding != RX_ENC_HE && streams > 4) || + (status->encoding == RX_ENC_HE && streams > 8))) + return 0; + + duration = airtime_mcs_groups[group].duration[idx]; + duration <<= airtime_mcs_groups[group].shift; + duration *= len; + duration /= AVG_PKT_SIZE; + duration /= 1024; + + duration += 36 + (streams << 2); + + return duration; +} +EXPORT_SYMBOL_GPL(ieee80211_calc_rx_airtime); + +static u32 ieee80211_calc_tx_airtime_rate(struct ieee80211_hw *hw, + struct ieee80211_tx_rate *rate, + u8 band, int len) +{ + struct ieee80211_rx_status stat = { + .band = band, + }; + + if (rate->idx < 0 || !rate->count) + return 0; + + if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH) + stat.bw = RATE_INFO_BW_80; + else if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH) + stat.bw = RATE_INFO_BW_40; + else + stat.bw = RATE_INFO_BW_20; + + stat.enc_flags = 0; + if (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE) + stat.enc_flags |= RX_ENC_FLAG_SHORTPRE; + if (rate->flags & IEEE80211_TX_RC_SHORT_GI) + stat.enc_flags |= RX_ENC_FLAG_SHORT_GI; + + stat.rate_idx = rate->idx; + if (rate->flags & IEEE80211_TX_RC_VHT_MCS) { + stat.encoding = RX_ENC_VHT; + stat.rate_idx = ieee80211_rate_get_vht_mcs(rate); + stat.nss = ieee80211_rate_get_vht_nss(rate); + } else if (rate->flags & IEEE80211_TX_RC_MCS) { + stat.encoding = RX_ENC_HT; + } else { + stat.encoding = RX_ENC_LEGACY; + } + + return ieee80211_calc_rx_airtime(hw, &stat, len); +} + +u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw, + struct ieee80211_tx_info *info, + int len) +{ + u32 duration = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(info->status.rates); i++) { + struct ieee80211_tx_rate *rate = &info->status.rates[i]; + u32 cur_duration; + + cur_duration = ieee80211_calc_tx_airtime_rate(hw, rate, + info->band, len); + if (!cur_duration) + break; + + duration += cur_duration * rate->count; + } + + return duration; +} +EXPORT_SYMBOL_GPL(ieee80211_calc_tx_airtime); + +u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *pubsta, + int len) +{ + struct ieee80211_supported_band *sband; + struct ieee80211_chanctx_conf *conf; + int rateidx, shift = 0; + bool cck, short_pream; + u32 basic_rates; + u8 band = 0; + u16 rate; + + len += 38; /* Ethernet header length */ + + conf = rcu_dereference(vif->chanctx_conf); + if (conf) { + band = conf->def.chan->band; + shift = ieee80211_chandef_get_shift(&conf->def); + } + + if (pubsta) { + struct sta_info *sta = container_of(pubsta, struct sta_info, + sta); + + return ieee80211_calc_tx_airtime_rate(hw, + &sta->tx_stats.last_rate, + band, len); + } + + if (!conf) + return 0; + + /* No station to get latest rate from, so calculate the worst-case + * duration using the lowest configured basic rate. + */ + sband = hw->wiphy->bands[band]; + + basic_rates = vif->bss_conf.basic_rates; + short_pream = vif->bss_conf.use_short_preamble; + + rateidx = basic_rates ? ffs(basic_rates) - 1 : 0; + rate = sband->bitrates[rateidx].bitrate << shift; + cck = sband->bitrates[rateidx].flags & IEEE80211_RATE_MANDATORY_B; + + return ieee80211_calc_legacy_rate_duration(rate, short_pream, cck, len); +} diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 4d458067d80d..6aee699deb28 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -980,7 +980,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS | BSS_CHANGED_TXPOWER | - BSS_CHANGED_TWT; + BSS_CHANGED_TWT | + BSS_CHANGED_HE_OBSS_PD; int err; int prev_beacon_int; @@ -1051,6 +1052,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.enable_beacon = true; sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p; sdata->vif.bss_conf.twt_responder = params->twt_responder; + memcpy(&sdata->vif.bss_conf.he_obss_pd, ¶ms->he_obss_pd, + sizeof(struct ieee80211_he_obss_pd)); sdata->vif.bss_conf.ssid_len = params->ssid_len; if (params->ssid_len) @@ -1529,7 +1532,6 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, struct sta_info *sta; struct ieee80211_sub_if_data *sdata; int err; - int layer2_update; if (params->vlan) { sdata = IEEE80211_DEV_TO_SUB_IF(params->vlan); @@ -1543,7 +1545,12 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, if (ether_addr_equal(mac, sdata->vif.addr)) return -EINVAL; - if (is_multicast_ether_addr(mac)) + if (!is_valid_ether_addr(mac)) + return -EINVAL; + + if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER) && + sdata->vif.type == NL80211_IFTYPE_STATION && + !sdata->u.mgd.associated) return -EINVAL; sta = sta_info_alloc(sdata, mac, GFP_KERNEL); @@ -1553,10 +1560,6 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) sta->sta.tdls = true; - if (sta->sta.tdls && sdata->vif.type == NL80211_IFTYPE_STATION && - !sdata->u.mgd.associated) - return -EINVAL; - err = sta_apply_parameters(local, sta, params); if (err) { sta_info_free(local, sta); @@ -1572,18 +1575,12 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, test_sta_flag(sta, WLAN_STA_ASSOC)) rate_control_rate_init(sta); - layer2_update = sdata->vif.type == NL80211_IFTYPE_AP_VLAN || - sdata->vif.type == NL80211_IFTYPE_AP; - err = sta_info_insert_rcu(sta); if (err) { rcu_read_unlock(); return err; } - if (layer2_update) - cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr); - rcu_read_unlock(); return 0; @@ -1681,10 +1678,11 @@ static int ieee80211_change_station(struct wiphy *wiphy, sta->sdata = vlansdata; ieee80211_check_fast_xmit(sta); - if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) + if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) { ieee80211_vif_inc_num_mcast(sta->sdata); - - cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr); + cfg80211_send_layer2_update(sta->sdata->dev, + sta->sta.addr); + } } err = sta_apply_parameters(local, sta, params); @@ -2956,6 +2954,28 @@ static int ieee80211_start_radar_detection(struct wiphy *wiphy, return err; } +static void ieee80211_end_cac(struct wiphy *wiphy, + struct net_device *dev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + + mutex_lock(&local->mtx); + list_for_each_entry(sdata, &local->interfaces, list) { + /* it might be waiting for the local->mtx, but then + * by the time it gets it, sdata->wdev.cac_started + * will no longer be true + */ + cancel_delayed_work(&sdata->dfs_cac_timer_work); + + if (sdata->wdev.cac_started) { + ieee80211_vif_release_channel(sdata); + sdata->wdev.cac_started = false; + } + } + mutex_unlock(&local->mtx); +} + static struct cfg80211_beacon_data * cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) { @@ -3430,7 +3450,7 @@ int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, spin_lock_irqsave(&local->ack_status_lock, spin_flags); id = idr_alloc(&local->ack_status_frames, ack_skb, - 1, 0x10000, GFP_ATOMIC); + 1, 0x2000, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, spin_flags); if (id < 0) { @@ -4025,6 +4045,7 @@ const struct cfg80211_ops mac80211_config_ops = { #endif .get_channel = ieee80211_cfg_get_channel, .start_radar_detection = ieee80211_start_radar_detection, + .end_cac = ieee80211_end_cac, .channel_switch = ieee80211_channel_switch, .set_qos_map = ieee80211_set_qos_map, .set_ap_chanwidth = ieee80211_set_ap_chanwidth, diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 2e7f75938c51..ad41d74530c6 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -59,6 +59,8 @@ static const struct file_operations name## _ops = { \ debugfs_create_file(#name, mode, phyd, local, &name## _ops); +DEBUGFS_READONLY_FILE(hw_conf, "%x", + local->hw.conf.flags); DEBUGFS_READONLY_FILE(user_power, "%d", local->user_power_level); DEBUGFS_READONLY_FILE(power, "%d", @@ -148,6 +150,87 @@ static const struct file_operations aqm_ops = { .llseek = default_llseek, }; +static ssize_t aql_txq_limit_read(struct file *file, + char __user *user_buf, + size_t count, + loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + char buf[400]; + int len = 0; + + len = scnprintf(buf, sizeof(buf), + "AC AQL limit low AQL limit high\n" + "VO %u %u\n" + "VI %u %u\n" + "BE %u %u\n" + "BK %u %u\n", + local->aql_txq_limit_low[IEEE80211_AC_VO], + local->aql_txq_limit_high[IEEE80211_AC_VO], + local->aql_txq_limit_low[IEEE80211_AC_VI], + local->aql_txq_limit_high[IEEE80211_AC_VI], + local->aql_txq_limit_low[IEEE80211_AC_BE], + local->aql_txq_limit_high[IEEE80211_AC_BE], + local->aql_txq_limit_low[IEEE80211_AC_BK], + local->aql_txq_limit_high[IEEE80211_AC_BK]); + return simple_read_from_buffer(user_buf, count, ppos, + buf, len); +} + +static ssize_t aql_txq_limit_write(struct file *file, + const char __user *user_buf, + size_t count, + loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + char buf[100]; + size_t len; + u32 ac, q_limit_low, q_limit_high, q_limit_low_old, q_limit_high_old; + struct sta_info *sta; + + if (count > sizeof(buf)) + return -EINVAL; + + if (copy_from_user(buf, user_buf, count)) + return -EFAULT; + + buf[sizeof(buf) - 1] = 0; + len = strlen(buf); + if (len > 0 && buf[len - 1] == '\n') + buf[len - 1] = 0; + + if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3) + return -EINVAL; + + if (ac >= IEEE80211_NUM_ACS) + return -EINVAL; + + q_limit_low_old = local->aql_txq_limit_low[ac]; + q_limit_high_old = local->aql_txq_limit_high[ac]; + + local->aql_txq_limit_low[ac] = q_limit_low; + local->aql_txq_limit_high[ac] = q_limit_high; + + mutex_lock(&local->sta_mtx); + list_for_each_entry(sta, &local->sta_list, list) { + /* If a sta has customized queue limits, keep it */ + if (sta->airtime[ac].aql_limit_low == q_limit_low_old && + sta->airtime[ac].aql_limit_high == q_limit_high_old) { + sta->airtime[ac].aql_limit_low = q_limit_low; + sta->airtime[ac].aql_limit_high = q_limit_high; + } + } + mutex_unlock(&local->sta_mtx); + return count; +} + +static const struct file_operations aql_txq_limit_ops = { + .write = aql_txq_limit_write, + .read = aql_txq_limit_read, + .open = simple_open, + .llseek = default_llseek, +}; + static ssize_t force_tx_status_read(struct file *file, char __user *user_buf, size_t count, @@ -271,8 +354,7 @@ static const char *hw_flag_names[] = { FLAG(TX_STATUS_NO_AMPDU_LEN), FLAG(SUPPORTS_MULTI_BSSID), FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID), - FLAG(EXT_KEY_ID_NATIVE), - FLAG(NO_AMPDU_KEYBORDER_SUPPORT), + FLAG(AMPDU_KEYBORDER_SUPPORT), #undef FLAG }; @@ -434,6 +516,7 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_ADD(hwflags); DEBUGFS_ADD(user_power); DEBUGFS_ADD(power); + DEBUGFS_ADD(hw_conf); DEBUGFS_ADD_MODE(force_tx_status, 0600); if (local->ops->wake_tx_queue) @@ -442,6 +525,10 @@ void debugfs_hw_add(struct ieee80211_local *local) debugfs_create_u16("airtime_flags", 0600, phyd, &local->airtime_flags); + DEBUGFS_ADD(aql_txq_limit); + debugfs_create_u32("aql_threshold", 0600, + phyd, &local->aql_threshold); + statsd = debugfs_create_dir("statistics", phyd); /* if the dir failed, don't put all the other things into the root! */ diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index b1438fd4d876..64b544ae9966 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -487,9 +487,14 @@ static ssize_t ieee80211_if_fmt_aqm( const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) { struct ieee80211_local *local = sdata->local; - struct txq_info *txqi = to_txq_info(sdata->vif.txq); + struct txq_info *txqi; int len; + if (!sdata->vif.txq) + return 0; + + txqi = to_txq_info(sdata->vif.txq); + spin_lock_bh(&local->fq.lock); rcu_read_lock(); @@ -658,7 +663,9 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_5ghz); DEBUGFS_ADD(hw_queues); - if (sdata->local->ops->wake_tx_queue) + if (sdata->local->ops->wake_tx_queue && + sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && + sdata->vif.type != NL80211_IFTYPE_NAN) DEBUGFS_ADD(aqm); } diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index c8ad20c28c43..c80b1e163ea4 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -197,7 +197,7 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf, { struct sta_info *sta = file->private_data; struct ieee80211_local *local = sta->sdata->local; - size_t bufsz = 200; + size_t bufsz = 400; char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; u64 rx_airtime = 0, tx_airtime = 0; s64 deficit[IEEE80211_NUM_ACS]; @@ -218,13 +218,8 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf, p += scnprintf(p, bufsz + buf - p, "RX: %llu us\nTX: %llu us\nWeight: %u\n" "Deficit: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n", - rx_airtime, - tx_airtime, - sta->airtime_weight, - deficit[0], - deficit[1], - deficit[2], - deficit[3]); + rx_airtime, tx_airtime, sta->airtime_weight, + deficit[0], deficit[1], deficit[2], deficit[3]); rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); @@ -250,6 +245,70 @@ static ssize_t sta_airtime_write(struct file *file, const char __user *userbuf, } STA_OPS_RW(airtime); +static ssize_t sta_aql_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct sta_info *sta = file->private_data; + struct ieee80211_local *local = sta->sdata->local; + size_t bufsz = 400; + char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; + u32 q_depth[IEEE80211_NUM_ACS]; + u32 q_limit_l[IEEE80211_NUM_ACS], q_limit_h[IEEE80211_NUM_ACS]; + ssize_t rv; + int ac; + + if (!buf) + return -ENOMEM; + + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { + spin_lock_bh(&local->active_txq_lock[ac]); + q_limit_l[ac] = sta->airtime[ac].aql_limit_low; + q_limit_h[ac] = sta->airtime[ac].aql_limit_high; + spin_unlock_bh(&local->active_txq_lock[ac]); + q_depth[ac] = atomic_read(&sta->airtime[ac].aql_tx_pending); + } + + p += scnprintf(p, bufsz + buf - p, + "Q depth: VO: %u us VI: %u us BE: %u us BK: %u us\n" + "Q limit[low/high]: VO: %u/%u VI: %u/%u BE: %u/%u BK: %u/%u\n", + q_depth[0], q_depth[1], q_depth[2], q_depth[3], + q_limit_l[0], q_limit_h[0], q_limit_l[1], q_limit_h[1], + q_limit_l[2], q_limit_h[2], q_limit_l[3], q_limit_h[3]), + + rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); + kfree(buf); + return rv; +} + +static ssize_t sta_aql_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct sta_info *sta = file->private_data; + u32 ac, q_limit_l, q_limit_h; + char _buf[100] = {}, *buf = _buf; + + if (count > sizeof(_buf)) + return -EINVAL; + + if (copy_from_user(buf, userbuf, count)) + return -EFAULT; + + buf[sizeof(_buf) - 1] = '\0'; + if (sscanf(buf, "limit %u %u %u", &ac, &q_limit_l, &q_limit_h) + != 3) + return -EINVAL; + + if (ac >= IEEE80211_NUM_ACS) + return -EINVAL; + + sta->airtime[ac].aql_limit_low = q_limit_l; + sta->airtime[ac].aql_limit_high = q_limit_h; + + return count; +} +STA_OPS_RW(aql); + + static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -928,12 +987,7 @@ STA_OPS(he_capa); sta->debugfs_dir, sta, &sta_ ##name## _ops); #define DEBUGFS_ADD_COUNTER(name, field) \ - if (sizeof(sta->field) == sizeof(u32)) \ - debugfs_create_u32(#name, 0400, sta->debugfs_dir, \ - (u32 *) &sta->field); \ - else \ - debugfs_create_u64(#name, 0400, sta->debugfs_dir, \ - (u64 *) &sta->field); + debugfs_create_ulong(#name, 0400, sta->debugfs_dir, &sta->field); void ieee80211_sta_debugfs_add(struct sta_info *sta) { @@ -978,14 +1032,12 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) DEBUGFS_ADD(airtime); - if (sizeof(sta->driver_buffered_tids) == sizeof(u32)) - debugfs_create_x32("driver_buffered_tids", 0400, - sta->debugfs_dir, - (u32 *)&sta->driver_buffered_tids); - else - debugfs_create_x64("driver_buffered_tids", 0400, - sta->debugfs_dir, - (u64 *)&sta->driver_buffered_tids); + if (wiphy_ext_feature_isset(local->hw.wiphy, + NL80211_EXT_FEATURE_AQL)) + DEBUGFS_ADD(aql); + + debugfs_create_xul("driver_buffered_tids", 0400, sta->debugfs_dir, + &sta->driver_buffered_tids); drv_sta_add_debugfs(local, sdata, &sta->sta, sta->debugfs_dir); } diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index c2d8b5451a5e..2c9b3eb8b652 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -692,14 +692,16 @@ static inline int drv_remain_on_channel(struct ieee80211_local *local, return ret; } -static inline int drv_cancel_remain_on_channel(struct ieee80211_local *local) +static inline int +drv_cancel_remain_on_channel(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) { int ret; might_sleep(); - trace_drv_cancel_remain_on_channel(local); - ret = local->ops->cancel_remain_on_channel(&local->hw); + trace_drv_cancel_remain_on_channel(local, sdata); + ret = local->ops->cancel_remain_on_channel(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; diff --git a/net/mac80211/he.c b/net/mac80211/he.c index 219650591c79..736da0035135 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -50,3 +50,43 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, he_cap->has_he = true; } + +void +ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif, + const struct ieee80211_he_operation *he_op_ie_elem) +{ + struct ieee80211_he_operation *he_operation = + &vif->bss_conf.he_operation; + + if (!he_op_ie_elem) { + memset(he_operation, 0, sizeof(*he_operation)); + return; + } + + vif->bss_conf.he_operation = *he_op_ie_elem; +} + +void +ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif, + const struct ieee80211_he_spr *he_spr_ie_elem) +{ + struct ieee80211_he_obss_pd *he_obss_pd = + &vif->bss_conf.he_obss_pd; + const u8 *data; + + memset(he_obss_pd, 0, sizeof(*he_obss_pd)); + + if (!he_spr_ie_elem) + return; + data = he_spr_ie_elem->optional; + + if (he_spr_ie_elem->he_sr_control & + IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT) + data++; + if (he_spr_ie_elem->he_sr_control & + IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT) { + he_obss_pd->max_offset = *data++; + he_obss_pd->min_offset = *data++; + he_obss_pd->enable = true; + } +} diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index d5a500b2a448..a2e4d6b8fd98 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -359,7 +359,7 @@ void ieee80211_ba_session_work(struct work_struct *work) sta->ampdu_mlme.tid_rx_manage_offl)) ___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, IEEE80211_MAX_AMPDU_BUF_HT, - false, true); + false, true, NULL); if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS, sta->ampdu_mlme.tid_rx_manage_offl)) diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index f00dca056295..d40744903fa9 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -538,7 +538,6 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct cfg80211_bss *cbss; - int err, changed = 0; sdata_assert_lock(sdata); @@ -560,13 +559,7 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) ifibss->chandef = sdata->csa_chandef; /* generate the beacon */ - err = ieee80211_ibss_csa_beacon(sdata, NULL); - if (err < 0) - return err; - - changed |= err; - - return changed; + return ieee80211_ibss_csa_beacon(sdata, NULL); } void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata) @@ -1252,6 +1245,7 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata, static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; unsigned long exp_time = IEEE80211_IBSS_INACTIVITY_LIMIT; @@ -1268,10 +1262,17 @@ static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata) if (time_is_before_jiffies(last_active + exp_time) || (time_is_before_jiffies(last_active + exp_rsn) && sta->sta_state != IEEE80211_STA_AUTHORIZED)) { + u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; + sta_dbg(sta->sdata, "expiring inactive %sSTA %pM\n", sta->sta_state != IEEE80211_STA_AUTHORIZED ? "not authorized " : "", sta->sta.addr); + ieee80211_send_deauth_disassoc(sdata, sta->sta.addr, + ifibss->bssid, + IEEE80211_STYPE_DEAUTH, + WLAN_REASON_DEAUTH_LEAVING, + true, frame_buf); WARN_ON(__sta_info_destroy(sta)); } } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 004e2e3adb88..ad15b3be8bb3 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1142,6 +1142,10 @@ struct ieee80211_local { u16 schedule_round[IEEE80211_NUM_ACS]; u16 airtime_flags; + u32 aql_txq_limit_low[IEEE80211_NUM_ACS]; + u32 aql_txq_limit_high[IEEE80211_NUM_ACS]; + u32 aql_threshold; + atomic_t aql_total_pending_airtime; const struct ieee80211_ops *ops; @@ -1480,6 +1484,7 @@ struct ieee802_11_elems { const struct ieee80211_meshconf_ie *mesh_config; const u8 *he_cap; const struct ieee80211_he_operation *he_operation; + const struct ieee80211_he_spr *he_spr; const struct ieee80211_mu_edca_param_set *mu_edca_param_set; const u8 *uora_element; const u8 *mesh_id; @@ -1506,6 +1511,7 @@ struct ieee802_11_elems { u8 max_bssid_indicator; u8 dtim_count; u8 dtim_period; + const struct ieee80211_addba_ext_ie *addba_ext_ie; /* length of them, respectively */ u8 ext_capab_len; @@ -1767,7 +1773,8 @@ ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags); void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, struct ieee80211_supported_band *sband, - int retry_count, int shift, bool send_to_cooked); + int retry_count, int shift, bool send_to_cooked, + struct ieee80211_tx_status *status); void ieee80211_check_fast_xmit(struct sta_info *sta); void ieee80211_check_fast_xmit_all(struct ieee80211_local *local); @@ -1804,7 +1811,8 @@ void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, void ___ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, - u16 buf_size, bool tx, bool auto_seq); + u16 buf_size, bool tx, bool auto_seq, + const struct ieee80211_addba_ext_ie *addbaext); void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, enum ieee80211_agg_stop_reason reason); void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, @@ -1869,6 +1877,13 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const u8 *he_cap_ie, u8 he_cap_len, struct sta_info *sta); +void +ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif, + const struct ieee80211_he_spr *he_spr_ie_elem); + +void +ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif, + const struct ieee80211_he_operation *he_op_ie_elem); /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, @@ -2088,7 +2103,8 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, const u8 *da, const u8 *key, u8 key_len, u8 key_idx, u32 tx_flags); void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, - const u8 *bssid, u16 stype, u16 reason, + const u8 *da, const u8 *bssid, + u16 stype, u16 reason, bool send_frame, u8 *frame_buf); enum { @@ -2133,9 +2149,11 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u32 cap); u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, const struct cfg80211_chan_def *chandef); +u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata, u8 iftype); u8 *ieee80211_ie_build_he_cap(u8 *pos, const struct ieee80211_sta_he_cap *he_cap, u8 *end); +u8 *ieee80211_ie_build_he_oper(u8 *pos); int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates); @@ -2235,6 +2253,10 @@ const char *ieee80211_get_reason_code_string(u16 reason_code); extern const struct ethtool_ops ieee80211_ethtool_ops; +u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *pubsta, + int len); #ifdef CONFIG_MAC80211_NOINLINE #define debug_noinline noinline #else diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 8dc6580e1787..af8b09214786 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1876,7 +1876,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, /* MTU range: 256 - 2304 */ ndev->min_mtu = 256; - ndev->max_mtu = IEEE80211_MAX_DATA_LEN; + ndev->max_mtu = local->hw.max_mtu; ret = register_netdevice(ndev); if (ret) { diff --git a/net/mac80211/key.c b/net/mac80211/key.c index dd60f6428049..0f889b919b06 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -6,6 +6,7 @@ * Copyright 2007-2008 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH + * Copyright 2018-2019 Intel Corporation */ #include <linux/if_ether.h> @@ -270,7 +271,7 @@ int ieee80211_set_tx_key(struct ieee80211_key *key) sta->ptk_idx = key->conf.keyidx; - if (ieee80211_hw_check(&local->hw, NO_AMPDU_KEYBORDER_SUPPORT)) + if (!ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT)) clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_check_fast_xmit(sta); @@ -290,15 +291,15 @@ static void ieee80211_pairwise_rekey(struct ieee80211_key *old, /* Extended Key ID key install, initial one or rekey */ if (sta->ptk_idx != INVALID_PTK_KEYIDX && - ieee80211_hw_check(&local->hw, - NO_AMPDU_KEYBORDER_SUPPORT)) { + !ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT)) { /* Aggregation Sessions with Extended Key ID must not * mix MPDUs with different keyIDs within one A-MPDU. - * Tear down any running Tx aggregation and all new - * Rx/Tx aggregation request during rekey if the driver - * asks us to do so. (Blocking Tx only would be - * sufficient but WLAN_STA_BLOCK_BA gets the job done - * for the few ms we need it.) + * Tear down running Tx aggregation sessions and block + * new Rx/Tx aggregation requests during rekey to + * ensure there are no A-MPDUs when the driver is not + * supporting A-MPDU key borders. (Blocking Tx only + * would be sufficient but WLAN_STA_BLOCK_BA gets the + * job done for the few ms we need it.) */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); mutex_lock(&sta->ampdu_mlme.mtx); @@ -781,9 +782,8 @@ int ieee80211_key_link(struct ieee80211_key *key, /* The rekey code assumes that the old and new key are using * the same cipher. Enforce the assumption for pairwise keys. */ - if (key && - ((alt_key && alt_key->conf.cipher != key->conf.cipher) || - (old_key && old_key->conf.cipher != key->conf.cipher))) + if ((alt_key && alt_key->conf.cipher != key->conf.cipher) || + (old_key && old_key->conf.cipher != key->conf.cipher)) goto out; } else if (sta) { old_key = key_mtx_dereference(sdata->local, sta->gtk[idx]); @@ -793,7 +793,7 @@ int ieee80211_key_link(struct ieee80211_key *key, /* Non-pairwise keys must also not switch the cipher on rekey */ if (!pairwise) { - if (key && old_key && old_key->conf.cipher != key->conf.cipher) + if (old_key && old_key->conf.cipher != key->conf.cipher) goto out; } @@ -843,46 +843,30 @@ void ieee80211_key_free(struct ieee80211_key *key, bool delay_tailroom) ieee80211_key_destroy(key, delay_tailroom); } -void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata) +void ieee80211_reenable_keys(struct ieee80211_sub_if_data *sdata) { struct ieee80211_key *key; struct ieee80211_sub_if_data *vlan; ASSERT_RTNL(); - if (WARN_ON(!ieee80211_sdata_running(sdata))) - return; - - mutex_lock(&sdata->local->key_mtx); - - WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt || - sdata->crypto_tx_tailroom_pending_dec); - - if (sdata->vif.type == NL80211_IFTYPE_AP) { - list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) - WARN_ON_ONCE(vlan->crypto_tx_tailroom_needed_cnt || - vlan->crypto_tx_tailroom_pending_dec); - } - - list_for_each_entry(key, &sdata->key_list, list) { - increment_tailroom_need_count(sdata); - ieee80211_key_enable_hw_accel(key); - } - - mutex_unlock(&sdata->local->key_mtx); -} - -void ieee80211_reset_crypto_tx_tailroom(struct ieee80211_sub_if_data *sdata) -{ - struct ieee80211_sub_if_data *vlan; - mutex_lock(&sdata->local->key_mtx); sdata->crypto_tx_tailroom_needed_cnt = 0; + sdata->crypto_tx_tailroom_pending_dec = 0; if (sdata->vif.type == NL80211_IFTYPE_AP) { - list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) + list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { vlan->crypto_tx_tailroom_needed_cnt = 0; + vlan->crypto_tx_tailroom_pending_dec = 0; + } + } + + if (ieee80211_sdata_running(sdata)) { + list_for_each_entry(key, &sdata->key_list, list) { + increment_tailroom_need_count(sdata); + ieee80211_key_enable_hw_accel(key); + } } mutex_unlock(&sdata->local->key_mtx); diff --git a/net/mac80211/key.h b/net/mac80211/key.h index b8b9cd743bf4..d6d6e89cf7dd 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -2,6 +2,7 @@ /* * Copyright 2002-2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. + * Copyright (C) 2019 Intel Corporation */ #ifndef IEEE80211_KEY_H @@ -156,8 +157,7 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata, bool force_synchronize); void ieee80211_free_sta_keys(struct ieee80211_local *local, struct sta_info *sta); -void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata); -void ieee80211_reset_crypto_tx_tailroom(struct ieee80211_sub_if_data *sdata); +void ieee80211_reenable_keys(struct ieee80211_sub_if_data *sdata); #define key_mtx_dereference(local, ref) \ rcu_dereference_protected(ref, lockdep_is_held(&((local)->key_mtx))) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 4c2702f128f3..4c2b5ba3ac09 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -639,6 +639,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH; local->hw.uapsd_queues = IEEE80211_DEFAULT_UAPSD_QUEUES; local->hw.uapsd_max_sp_len = IEEE80211_DEFAULT_MAX_SP_LEN; + local->hw.max_mtu = IEEE80211_MAX_DATA_LEN; local->user_power_level = IEEE80211_UNSET_POWER_LEVEL; wiphy->ht_capa_mod_mask = &mac80211_ht_capa_mod_mask; wiphy->vht_capa_mod_mask = &mac80211_vht_capa_mod_mask; @@ -666,8 +667,14 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, for (i = 0; i < IEEE80211_NUM_ACS; i++) { INIT_LIST_HEAD(&local->active_txqs[i]); spin_lock_init(&local->active_txq_lock[i]); + local->aql_txq_limit_low[i] = IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L; + local->aql_txq_limit_high[i] = + IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H; } + local->airtime_flags = AIRTIME_USE_TX | AIRTIME_USE_RX; + local->aql_threshold = IEEE80211_AQL_THRESHOLD; + atomic_set(&local->aql_total_pending_airtime, 0); INIT_LIST_HEAD(&local->chanctx_list); mutex_init(&local->chanctx_mtx); @@ -1048,21 +1055,15 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) } } - /* Enable Extended Key IDs when driver allowed it, or when it - * supports neither HW crypto nor A-MPDUs - */ - if ((!local->ops->set_key && - !ieee80211_hw_check(hw, AMPDU_AGGREGATION)) || - ieee80211_hw_check(&local->hw, EXT_KEY_ID_NATIVE)) - wiphy_ext_feature_set(local->hw.wiphy, - NL80211_EXT_FEATURE_EXT_KEY_ID); - - /* Mac80211 and therefore all cards only using SW crypto are able to - * handle PTK rekeys correctly + /* Mac80211 and therefore all drivers using SW crypto only + * are able to handle PTK rekeys and Extended Key ID. */ - if (!local->ops->set_key) + if (!local->ops->set_key) { wiphy_ext_feature_set(local->hw.wiphy, NL80211_EXT_FEATURE_CAN_REPLACE_PTK0); + wiphy_ext_feature_set(local->hw.wiphy, + NL80211_EXT_FEATURE_EXT_KEY_ID); + } /* * Calculate scan IE length -- we need this to alloc @@ -1297,8 +1298,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) ieee80211_remove_interfaces(local); fail_rate: rtnl_unlock(); - ieee80211_led_exit(local); fail_flows: + ieee80211_led_exit(local); destroy_workqueue(local->workqueue); fail_workqueue: wiphy_unregister(local->hw.wiphy); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 2e7fa743c892..d09b3c789314 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -532,6 +532,61 @@ int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata, return 0; } +int mesh_add_he_cap_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, u8 ie_len) +{ + const struct ieee80211_sta_he_cap *he_cap; + struct ieee80211_supported_band *sband; + u8 *pos; + + sband = ieee80211_get_sband(sdata); + if (!sband) + return -EINVAL; + + he_cap = ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT); + + if (!he_cap || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) + return 0; + + if (skb_tailroom(skb) < ie_len) + return -ENOMEM; + + pos = skb_put(skb, ie_len); + ieee80211_ie_build_he_cap(pos, he_cap, pos + ie_len); + + return 0; +} + +int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + const struct ieee80211_sta_he_cap *he_cap; + struct ieee80211_supported_band *sband; + u8 *pos; + + sband = ieee80211_get_sband(sdata); + if (!sband) + return -EINVAL; + + he_cap = ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT); + if (!he_cap || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) + return 0; + + if (skb_tailroom(skb) < 2 + 1 + sizeof(struct ieee80211_he_operation)) + return -ENOMEM; + + pos = skb_put(skb, 2 + 1 + sizeof(struct ieee80211_he_operation)); + ieee80211_ie_build_he_oper(pos); + + return 0; +} + static void ieee80211_mesh_path_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = @@ -677,6 +732,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) struct ieee80211_chanctx_conf *chanctx_conf; struct mesh_csa_settings *csa; enum nl80211_band band; + u8 ie_len_he_cap; u8 *pos; struct ieee80211_sub_if_data *sdata; int hdr_len = offsetofend(struct ieee80211_mgmt, u.beacon); @@ -687,6 +743,8 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) band = chanctx_conf->def.chan->band; rcu_read_unlock(); + ie_len_he_cap = ieee80211_ie_len_he_cap(sdata, + NL80211_IFTYPE_MESH_POINT); head_len = hdr_len + 2 + /* NULL SSID */ /* Channel Switch Announcement */ @@ -706,6 +764,8 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) 2 + sizeof(__le16) + /* awake window */ 2 + sizeof(struct ieee80211_vht_cap) + 2 + sizeof(struct ieee80211_vht_operation) + + ie_len_he_cap + + 2 + 1 + sizeof(struct ieee80211_he_operation) + ifmsh->ie_len; bcn = kzalloc(sizeof(*bcn) + head_len + tail_len, GFP_KERNEL); @@ -823,6 +883,8 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) mesh_add_awake_window_ie(sdata, skb) || mesh_add_vht_cap_ie(sdata, skb) || mesh_add_vht_oper_ie(sdata, skb) || + mesh_add_he_cap_ie(sdata, skb, ie_len_he_cap) || + mesh_add_he_oper_ie(sdata, skb) || mesh_add_vendor_ies(sdata, skb)) goto out_free; diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 94d57cce70da..953f720754e8 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -218,6 +218,10 @@ int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); +int mesh_add_he_cap_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, u8 ie_len); +int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); void mesh_rmc_free(struct ieee80211_sub_if_data *sdata); int mesh_rmc_init(struct ieee80211_sub_if_data *sdata); void ieee80211s_init(void); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 68af62306385..d69983370381 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -328,6 +328,9 @@ u32 airtime_link_metric_get(struct ieee80211_local *local, unsigned long fail_avg = ewma_mesh_fail_avg_read(&sta->mesh->fail_avg); + if (sta->mesh->plink_state != NL80211_PLINK_ESTAB) + return MAX_METRIC; + /* Try to get rate based on HW/SW RC algorithm. * Rate is returned in units of Kbps, correct this * to comply with airtime calculation units diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index dd3aefd052a9..737c5f4dbf52 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -218,9 +218,12 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, bool include_plid = false; u16 peering_proto = 0; u8 *pos, ie_len = 4; + u8 ie_len_he_cap; int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.self_prot); int err = -ENOMEM; + ie_len_he_cap = ieee80211_ie_len_he_cap(sdata, + NL80211_IFTYPE_MESH_POINT); skb = dev_alloc_skb(local->tx_headroom + hdr_len + 2 + /* capability info */ @@ -233,6 +236,8 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, 2 + sizeof(struct ieee80211_ht_operation) + 2 + sizeof(struct ieee80211_vht_cap) + 2 + sizeof(struct ieee80211_vht_operation) + + ie_len_he_cap + + 2 + 1 + sizeof(struct ieee80211_he_operation) + 2 + 8 + /* peering IE */ sdata->u.mesh.ie_len); if (!skb) @@ -321,7 +326,9 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, if (mesh_add_ht_cap_ie(sdata, skb) || mesh_add_ht_oper_ie(sdata, skb) || mesh_add_vht_cap_ie(sdata, skb) || - mesh_add_vht_oper_ie(sdata, skb)) + mesh_add_vht_oper_ie(sdata, skb) || + mesh_add_he_cap_ie(sdata, skb, ie_len_he_cap) || + mesh_add_he_oper_ie(sdata, skb)) goto free; } @@ -433,6 +440,9 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, elems->vht_cap_elem, sta); + ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap, + elems->he_cap_len, sta); + if (bw != sta->sta.bandwidth) changed |= IEEE80211_RC_BW_CHANGED; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 4c888dc9bd81..e041af2f021a 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -8,7 +8,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation */ #include <linux/delay.h> @@ -158,10 +158,10 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); + memset(chandef, 0, sizeof(struct cfg80211_chan_def)); chandef->chan = channel; chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = channel->center_freq; - chandef->center_freq2 = 0; if (!ht_oper || !sta_ht_cap.ht_supported) { ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; @@ -1311,7 +1311,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, if (!res) { ch_switch.timestamp = timestamp; ch_switch.device_timestamp = device_timestamp; - ch_switch.block_tx = beacon ? csa_ie.mode : 0; + ch_switch.block_tx = csa_ie.mode; ch_switch.chandef = csa_ie.chandef; ch_switch.count = csa_ie.count; ch_switch.delay = csa_ie.max_switch_time; @@ -1404,7 +1404,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, sdata->vif.csa_active = true; sdata->csa_chandef = csa_ie.chandef; - sdata->csa_block_tx = ch_switch.block_tx; + sdata->csa_block_tx = csa_ie.mode; ifmgd->csa_ignored_same_chan = false; if (sdata->csa_block_tx) @@ -1438,7 +1438,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, * reset when the disconnection worker runs. */ sdata->vif.csa_active = true; - sdata->csa_block_tx = ch_switch.block_tx; + sdata->csa_block_tx = csa_ie.mode; ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work); mutex_unlock(&local->chanctx_mtx); @@ -2278,8 +2278,9 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, !ifmgd->have_beacon) drv_mgd_prepare_tx(sdata->local, sdata, 0); - ieee80211_send_deauth_disassoc(sdata, ifmgd->bssid, stype, - reason, tx, frame_buf); + ieee80211_send_deauth_disassoc(sdata, ifmgd->bssid, + ifmgd->bssid, stype, reason, + tx, frame_buf); } /* flush out frame - make sure the deauth was actually sent */ @@ -2522,7 +2523,10 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) if (ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) { ifmgd->nullfunc_failed = false; - ieee80211_send_nullfunc(sdata->local, sdata, false); + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) + ifmgd->probe_send_count--; + else + ieee80211_send_nullfunc(sdata->local, sdata, false); } else { int ssid_len; @@ -2629,7 +2633,8 @@ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw, rcu_read_lock(); ssid = ieee80211_bss_get_ie(cbss, WLAN_EID_SSID); - if (WARN_ON_ONCE(ssid == NULL)) + if (WARN_ONCE(!ssid || ssid[1] > IEEE80211_MAX_SSID_LEN, + "invalid SSID element (len=%d)", ssid ? ssid[1] : -1)) ssid_len = 0; else ssid_len = ssid[1]; @@ -3181,15 +3186,14 @@ static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata, static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss, - struct ieee80211_mgmt *mgmt, size_t len) + struct ieee80211_mgmt *mgmt, size_t len, + struct ieee802_11_elems *elems) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; struct sta_info *sta; - u8 *pos; u16 capab_info, aid; - struct ieee802_11_elems elems; struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; const struct cfg80211_bss_ies *bss_ies = NULL; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; @@ -3217,19 +3221,15 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, ifmgd->broken_ap = true; } - pos = mgmt->u.assoc_resp.variable; - ieee802_11_parse_elems(pos, len - (pos - (u8 *)mgmt), false, &elems, - mgmt->bssid, assoc_data->bss->bssid); - - if (!elems.supp_rates) { + if (!elems->supp_rates) { sdata_info(sdata, "no SuppRates element in AssocResp\n"); return false; } ifmgd->aid = aid; ifmgd->tdls_chan_switch_prohibited = - elems.ext_capab && elems.ext_capab_len >= 5 && - (elems.ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED); + elems->ext_capab && elems->ext_capab_len >= 5 && + (elems->ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED); /* * Some APs are erroneously not including some information in their @@ -3238,11 +3238,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * 2G/3G/4G wifi routers, reported models include the "Onda PN51T", * "Vodafone PocketWiFi 2", "ZTE MF60" and a similar T-Mobile device. */ - if ((assoc_data->wmm && !elems.wmm_param) || + if ((assoc_data->wmm && !elems->wmm_param) || (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && - (!elems.ht_cap_elem || !elems.ht_operation)) || + (!elems->ht_cap_elem || !elems->ht_operation)) || (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && - (!elems.vht_cap_elem || !elems.vht_operation))) { + (!elems->vht_cap_elem || !elems->vht_operation))) { const struct cfg80211_bss_ies *ies; struct ieee802_11_elems bss_elems; @@ -3260,8 +3260,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, mgmt->bssid, assoc_data->bss->bssid); if (assoc_data->wmm && - !elems.wmm_param && bss_elems.wmm_param) { - elems.wmm_param = bss_elems.wmm_param; + !elems->wmm_param && bss_elems.wmm_param) { + elems->wmm_param = bss_elems.wmm_param; sdata_info(sdata, "AP bug: WMM param missing from AssocResp\n"); } @@ -3270,27 +3270,27 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * Also check if we requested HT/VHT, otherwise the AP doesn't * have to include the IEs in the (re)association response. */ - if (!elems.ht_cap_elem && bss_elems.ht_cap_elem && + if (!elems->ht_cap_elem && bss_elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { - elems.ht_cap_elem = bss_elems.ht_cap_elem; + elems->ht_cap_elem = bss_elems.ht_cap_elem; sdata_info(sdata, "AP bug: HT capability missing from AssocResp\n"); } - if (!elems.ht_operation && bss_elems.ht_operation && + if (!elems->ht_operation && bss_elems.ht_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { - elems.ht_operation = bss_elems.ht_operation; + elems->ht_operation = bss_elems.ht_operation; sdata_info(sdata, "AP bug: HT operation missing from AssocResp\n"); } - if (!elems.vht_cap_elem && bss_elems.vht_cap_elem && + if (!elems->vht_cap_elem && bss_elems.vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) { - elems.vht_cap_elem = bss_elems.vht_cap_elem; + elems->vht_cap_elem = bss_elems.vht_cap_elem; sdata_info(sdata, "AP bug: VHT capa missing from AssocResp\n"); } - if (!elems.vht_operation && bss_elems.vht_operation && + if (!elems->vht_operation && bss_elems.vht_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) { - elems.vht_operation = bss_elems.vht_operation; + elems->vht_operation = bss_elems.vht_operation; sdata_info(sdata, "AP bug: VHT operation missing from AssocResp\n"); } @@ -3301,7 +3301,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * they should be present here. This is just a safety net. */ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && - (!elems.wmm_param || !elems.ht_cap_elem || !elems.ht_operation)) { + (!elems->wmm_param || !elems->ht_cap_elem || !elems->ht_operation)) { sdata_info(sdata, "HT AP is missing WMM params or HT capability/operation\n"); ret = false; @@ -3309,7 +3309,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && - (!elems.vht_cap_elem || !elems.vht_operation)) { + (!elems->vht_cap_elem || !elems->vht_operation)) { sdata_info(sdata, "VHT AP is missing VHT capability/operation\n"); ret = false; @@ -3336,7 +3336,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && - (!elems.he_cap || !elems.he_operation)) { + (!elems->he_cap || !elems->he_operation)) { mutex_unlock(&sdata->local->sta_mtx); sdata_info(sdata, "HE AP is missing HE capability/operation\n"); @@ -3345,23 +3345,23 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } /* Set up internal HT/VHT capabilities */ - if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) + if (elems->ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, - elems.ht_cap_elem, sta); + elems->ht_cap_elem, sta); - if (elems.vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) + if (elems->vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, - elems.vht_cap_elem, sta); + elems->vht_cap_elem, sta); - if (elems.he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && - elems.he_cap) { + if (elems->he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + elems->he_cap) { ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, - elems.he_cap, - elems.he_cap_len, + elems->he_cap, + elems->he_cap_len, sta); bss_conf->he_support = sta->sta.he_cap.has_he; - changed |= ieee80211_recalc_twt_req(sdata, sta, &elems); + changed |= ieee80211_recalc_twt_req(sdata, sta, elems); } else { bss_conf->he_support = false; bss_conf->twt_requester = false; @@ -3369,14 +3369,14 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (bss_conf->he_support) { bss_conf->bss_color = - le32_get_bits(elems.he_operation->he_oper_params, + le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_BSS_COLOR_MASK); bss_conf->htc_trig_based_pkt_ext = - le32_get_bits(elems.he_operation->he_oper_params, + le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK); bss_conf->frame_time_rts_th = - le32_get_bits(elems.he_operation->he_oper_params, + le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); bss_conf->multi_sta_back_32bit = @@ -3387,10 +3387,12 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_ACK_EN; - bss_conf->uora_exists = !!elems.uora_element; - if (elems.uora_element) - bss_conf->uora_ocw_range = elems.uora_element[0]; + bss_conf->uora_exists = !!elems->uora_element; + if (elems->uora_element) + bss_conf->uora_ocw_range = elems->uora_element[0]; + ieee80211_he_op_ie_to_bss_conf(&sdata->vif, elems->he_operation); + ieee80211_he_spr_ie_to_bss_conf(&sdata->vif, elems->he_spr); /* TODO: OPEN: what happens if BSS color disable is set? */ } @@ -3414,11 +3416,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * NSS calculation (that would be done in rate_control_rate_init()) * and use the # of streams from that element. */ - if (elems.opmode_notif && - !(*elems.opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)) { + if (elems->opmode_notif && + !(*elems->opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)) { u8 nss; - nss = *elems.opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; + nss = *elems->opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT; nss += 1; sta->sta.rx_nss = nss; @@ -3433,7 +3435,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, sta->sta.mfp = false; } - sta->sta.wme = elems.wmm_param && local->hw.queues >= IEEE80211_NUM_ACS; + sta->sta.wme = elems->wmm_param && local->hw.queues >= IEEE80211_NUM_ACS; err = sta_info_move_state(sta, IEEE80211_STA_ASSOC); if (!err && !(ifmgd->flags & IEEE80211_STA_CONTROL_PORT)) @@ -3461,9 +3463,9 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) { ieee80211_set_wmm_default(sdata, false, false); - } else if (!ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, - elems.wmm_param_len, - elems.mu_edca_param_set)) { + } else if (!ieee80211_sta_wmm_params(local, sdata, elems->wmm_param, + elems->wmm_param_len, + elems->mu_edca_param_set)) { /* still enable QoS since we might have HT/VHT */ ieee80211_set_wmm_default(sdata, false, true); /* set the disable-WMM flag in this case to disable @@ -3477,11 +3479,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } changed |= BSS_CHANGED_QOS; - if (elems.max_idle_period_ie) { + if (elems->max_idle_period_ie) { bss_conf->max_idle_period = - le16_to_cpu(elems.max_idle_period_ie->max_idle_period); + le16_to_cpu(elems->max_idle_period_ie->max_idle_period); bss_conf->protected_keep_alive = - !!(elems.max_idle_period_ie->idle_options & + !!(elems->max_idle_period_ie->idle_options & WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE); changed |= BSS_CHANGED_KEEP_ALIVE; } else { @@ -3591,7 +3593,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); } else { - if (!ieee80211_assoc_success(sdata, bss, mgmt, len)) { + if (!ieee80211_assoc_success(sdata, bss, mgmt, len, &elems)) { /* oops -- internal error -- send timeout for now */ ieee80211_destroy_assoc_data(sdata, false, false); cfg80211_assoc_timeout(sdata->dev, bss); @@ -4504,7 +4506,7 @@ void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata) * cfg80211 won't know and won't actually abort those attempts, * thus we need to do that ourselves. */ - ieee80211_send_deauth_disassoc(sdata, bssid, + ieee80211_send_deauth_disassoc(sdata, bssid, bssid, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DEAUTH_LEAVING, false, frame_buf); @@ -5227,7 +5229,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, rcu_read_lock(); ssidie = ieee80211_bss_get_ie(req->bss, WLAN_EID_SSID); - if (!ssidie) { + if (!ssidie || ssidie[1] > sizeof(assoc_data->ssid)) { rcu_read_unlock(); kfree(assoc_data); return -EINVAL; @@ -5291,7 +5293,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; ifmgd->flags |= IEEE80211_STA_DISABLE_HE; netdev_info(sdata->dev, - "disabling HE/HT/VHT due to WEP/TKIP use\n"); + "disabling HT/VHT/HE due to WEP/TKIP use\n"); } } @@ -5545,7 +5547,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, ieee80211_get_reason_code_string(req->reason_code)); drv_mgd_prepare_tx(sdata->local, sdata, 0); - ieee80211_send_deauth_disassoc(sdata, req->bssid, + ieee80211_send_deauth_disassoc(sdata, req->bssid, req->bssid, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, frame_buf); @@ -5565,7 +5567,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, ieee80211_get_reason_code_string(req->reason_code)); drv_mgd_prepare_tx(sdata->local, sdata, 0); - ieee80211_send_deauth_disassoc(sdata, req->bssid, + ieee80211_send_deauth_disassoc(sdata, req->bssid, req->bssid, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, frame_buf); diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 60ef8972b254..c710504ccf1a 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -8,6 +8,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> + * Copyright (C) 2019 Intel Corporation */ #include <linux/export.h> #include <net/mac80211.h> @@ -732,7 +733,7 @@ static int ieee80211_cancel_roc(struct ieee80211_local *local, } if (local->ops->remain_on_channel) { - ret = drv_cancel_remain_on_channel(local); + ret = drv_cancel_remain_on_channel(local, roc->sdata); if (WARN_ON_ONCE(ret)) { mutex_unlock(&local->mtx); return ret; @@ -991,7 +992,7 @@ void ieee80211_roc_purge(struct ieee80211_local *local, if (roc->started) { if (local->ops->remain_on_channel) { /* can race, so ignore return value */ - drv_cancel_remain_on_channel(local); + drv_cancel_remain_on_channel(local, sdata); ieee80211_roc_notify_destroy(roc); } else { roc->abort = true; diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 5d5348bc41ec..5397c6dad056 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -60,15 +60,6 @@ static inline void rate_control_add_sta_debugfs(struct sta_info *sta) #endif } -static inline void rate_control_remove_sta_debugfs(struct sta_info *sta) -{ -#ifdef CONFIG_MAC80211_DEBUGFS - struct rate_control_ref *ref = sta->rate_ctrl; - if (ref && ref->ops->remove_sta_debugfs) - ref->ops->remove_sta_debugfs(ref->priv, sta->rate_ctrl_priv); -#endif -} - void ieee80211_check_rate_mask(struct ieee80211_sub_if_data *sdata); /* Get a reference to the rate control algorithm. If `name' is NULL, get the diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index ee86c3333999..86bc469a28bc 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -70,7 +70,7 @@ rix_to_ndx(struct minstrel_sta_info *mi, int rix) } /* return current EMWA throughput */ -int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma) +int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_avg) { int usecs; @@ -79,13 +79,13 @@ int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma) usecs = 1000000; /* reset thr. below 10% success */ - if (mr->stats.prob_ewma < MINSTREL_FRAC(10, 100)) + if (mr->stats.prob_avg < MINSTREL_FRAC(10, 100)) return 0; - if (prob_ewma > MINSTREL_FRAC(90, 100)) + if (prob_avg > MINSTREL_FRAC(90, 100)) return MINSTREL_TRUNC(100000 * (MINSTREL_FRAC(90, 100) / usecs)); else - return MINSTREL_TRUNC(100000 * (prob_ewma / usecs)); + return MINSTREL_TRUNC(100000 * (prob_avg / usecs)); } /* find & sort topmost throughput rates */ @@ -98,8 +98,8 @@ minstrel_sort_best_tp_rates(struct minstrel_sta_info *mi, int i, u8 *tp_list) for (j = MAX_THR_RATES; j > 0; --j) { tmp_mrs = &mi->r[tp_list[j - 1]].stats; - if (minstrel_get_tp_avg(&mi->r[i], cur_mrs->prob_ewma) <= - minstrel_get_tp_avg(&mi->r[tp_list[j - 1]], tmp_mrs->prob_ewma)) + if (minstrel_get_tp_avg(&mi->r[i], cur_mrs->prob_avg) <= + minstrel_get_tp_avg(&mi->r[tp_list[j - 1]], tmp_mrs->prob_avg)) break; } @@ -157,20 +157,24 @@ minstrel_update_rates(struct minstrel_priv *mp, struct minstrel_sta_info *mi) * Recalculate statistics and counters of a given rate */ void -minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs) +minstrel_calc_rate_stats(struct minstrel_priv *mp, + struct minstrel_rate_stats *mrs) { unsigned int cur_prob; if (unlikely(mrs->attempts > 0)) { mrs->sample_skipped = 0; cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts); - if (unlikely(!mrs->att_hist)) { - mrs->prob_ewma = cur_prob; + if (mp->new_avg) { + minstrel_filter_avg_add(&mrs->prob_avg, + &mrs->prob_avg_1, cur_prob); + } else if (unlikely(!mrs->att_hist)) { + mrs->prob_avg = cur_prob; } else { /*update exponential weighted moving avarage */ - mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma, - cur_prob, - EWMA_LEVEL); + mrs->prob_avg = minstrel_ewma(mrs->prob_avg, + cur_prob, + EWMA_LEVEL); } mrs->att_hist += mrs->attempts; mrs->succ_hist += mrs->success; @@ -200,12 +204,12 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) struct minstrel_rate_stats *tmp_mrs = &mi->r[tmp_prob_rate].stats; /* Update statistics of success probability per rate */ - minstrel_calc_rate_stats(mrs); + minstrel_calc_rate_stats(mp, mrs); /* Sample less often below the 10% chance of success. * Sample less often above the 95% chance of success. */ - if (mrs->prob_ewma > MINSTREL_FRAC(95, 100) || - mrs->prob_ewma < MINSTREL_FRAC(10, 100)) { + if (mrs->prob_avg > MINSTREL_FRAC(95, 100) || + mrs->prob_avg < MINSTREL_FRAC(10, 100)) { mr->adjusted_retry_count = mrs->retry_count >> 1; if (mr->adjusted_retry_count > 2) mr->adjusted_retry_count = 2; @@ -225,14 +229,14 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) * choose the maximum throughput rate as max_prob_rate * (2) if all success probabilities < 95%, the rate with * highest success probability is chosen as max_prob_rate */ - if (mrs->prob_ewma >= MINSTREL_FRAC(95, 100)) { - tmp_cur_tp = minstrel_get_tp_avg(mr, mrs->prob_ewma); + if (mrs->prob_avg >= MINSTREL_FRAC(95, 100)) { + tmp_cur_tp = minstrel_get_tp_avg(mr, mrs->prob_avg); tmp_prob_tp = minstrel_get_tp_avg(&mi->r[tmp_prob_rate], - tmp_mrs->prob_ewma); + tmp_mrs->prob_avg); if (tmp_cur_tp >= tmp_prob_tp) tmp_prob_rate = i; } else { - if (mrs->prob_ewma >= tmp_mrs->prob_ewma) + if (mrs->prob_avg >= tmp_mrs->prob_avg) tmp_prob_rate = i; } } @@ -290,7 +294,7 @@ minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband, mi->sample_deferred--; if (time_after(jiffies, mi->last_stats_update + - (mp->update_interval * HZ) / 1000)) + mp->update_interval / (mp->new_avg ? 2 : 1))) minstrel_update_stats(mp, mi); } @@ -422,7 +426,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, * has a probability of >95%, we shouldn't be attempting * to use it, as this only wastes precious airtime */ if (!mrr_capable && - (mi->r[ndx].stats.prob_ewma > MINSTREL_FRAC(95, 100))) + (mi->r[ndx].stats.prob_avg > MINSTREL_FRAC(95, 100))) return; mi->prev_sample = true; @@ -573,7 +577,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta) * computing cur_tp */ tmp_mrs = &mi->r[idx].stats; - tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma) * 10; + tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_avg) * 10; tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024; return tmp_cur_tp; diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h index 3c96a853adbd..dbb43bcd3c45 100644 --- a/net/mac80211/rc80211_minstrel.h +++ b/net/mac80211/rc80211_minstrel.h @@ -19,6 +19,21 @@ #define MAX_THR_RATES 4 /* + * Coefficients for moving average with noise filter (period=16), + * scaled by 10 bits + * + * a1 = exp(-pi * sqrt(2) / period) + * coeff2 = 2 * a1 * cos(sqrt(2) * 2 * pi / period) + * coeff3 = -sqr(a1) + * coeff1 = 1 - coeff2 - coeff3 + */ +#define MINSTREL_AVG_COEFF1 (MINSTREL_FRAC(1, 1) - \ + MINSTREL_AVG_COEFF2 - \ + MINSTREL_AVG_COEFF3) +#define MINSTREL_AVG_COEFF2 0x00001499 +#define MINSTREL_AVG_COEFF3 -0x0000092e + +/* * Perform EWMA (Exponentially Weighted Moving Average) calculation */ static inline int @@ -32,6 +47,37 @@ minstrel_ewma(int old, int new, int weight) return old + incr; } +static inline int minstrel_filter_avg_add(u16 *prev_1, u16 *prev_2, s32 in) +{ + s32 out_1 = *prev_1; + s32 out_2 = *prev_2; + s32 val; + + if (!in) + in += 1; + + if (!out_1) { + val = out_1 = in; + goto out; + } + + val = MINSTREL_AVG_COEFF1 * in; + val += MINSTREL_AVG_COEFF2 * out_1; + val += MINSTREL_AVG_COEFF3 * out_2; + val >>= MINSTREL_SCALE; + + if (val > 1 << MINSTREL_SCALE) + val = 1 << MINSTREL_SCALE; + if (val < 0) + val = 1; + +out: + *prev_2 = out_1; + *prev_1 = val; + + return val; +} + struct minstrel_rate_stats { /* current / last sampling period attempts/success counters */ u16 attempts, last_attempts; @@ -40,8 +86,9 @@ struct minstrel_rate_stats { /* total attempts/success counters */ u32 att_hist, succ_hist; - /* prob_ewma - exponential weighted moving average of prob */ - u16 prob_ewma; + /* prob_avg - moving average of prob */ + u16 prob_avg; + u16 prob_avg_1; /* maximum retry counts */ u8 retry_count; @@ -95,6 +142,8 @@ struct minstrel_sta_info { struct minstrel_priv { struct ieee80211_hw *hw; bool has_mrr; + bool new_avg; + u32 sample_switch; unsigned int cw_min; unsigned int cw_max; unsigned int max_retry; @@ -125,8 +174,9 @@ extern const struct rate_control_ops mac80211_minstrel; void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); /* Recalculate success probabilities and counters for a given rate using EWMA */ -void minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs); -int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma); +void minstrel_calc_rate_stats(struct minstrel_priv *mp, + struct minstrel_rate_stats *mrs); +int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_avg); /* debugfs */ int minstrel_stats_open(struct inode *inode, struct file *file); diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c index c8afd85b51a0..9b8e0daeb7bb 100644 --- a/net/mac80211/rc80211_minstrel_debugfs.c +++ b/net/mac80211/rc80211_minstrel_debugfs.c @@ -90,8 +90,8 @@ minstrel_stats_open(struct inode *inode, struct file *file) p += sprintf(p, "%6u ", mr->perfect_tx_time); tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); - tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); - eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + tp_avg = minstrel_get_tp_avg(mr, mrs->prob_avg); + eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u" " %3u %3u %-3u " @@ -147,8 +147,8 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file) p += sprintf(p, "%u,",mr->perfect_tx_time); tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); - tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); - eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + tp_avg = minstrel_get_tp_avg(mr, mrs->prob_avg); + eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u,%u,%u," "%llu,%llu,%d,%d\n", diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 5a882da82f0e..694a31978a04 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -18,6 +18,8 @@ #define AVG_AMPDU_SIZE 16 #define AVG_PKT_SIZE 1200 +#define SAMPLE_SWITCH_THR 100 + /* Number of bits for an average sized packet */ #define MCS_NBITS ((AVG_PKT_SIZE * AVG_AMPDU_SIZE) << 3) @@ -58,6 +60,7 @@ [GROUP_IDX(_streams, _sgi, _ht40)] = { \ .streams = _streams, \ .shift = _s, \ + .bw = _ht40, \ .flags = \ IEEE80211_TX_RC_MCS | \ (_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \ @@ -94,6 +97,7 @@ [VHT_GROUP_IDX(_streams, _sgi, _bw)] = { \ .streams = _streams, \ .shift = _s, \ + .bw = _bw, \ .flags = \ IEEE80211_TX_RC_VHT_MCS | \ (_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \ @@ -342,12 +346,12 @@ minstrel_ht_avg_ampdu_len(struct minstrel_ht_sta *mi) */ int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, - int prob_ewma) + int prob_avg) { unsigned int nsecs = 0; /* do not account throughput if sucess prob is below 10% */ - if (prob_ewma < MINSTREL_FRAC(10, 100)) + if (prob_avg < MINSTREL_FRAC(10, 100)) return 0; if (group != MINSTREL_CCK_GROUP) @@ -361,11 +365,11 @@ minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, * account for collision related packet error rate fluctuation * (prob is scaled - see MINSTREL_FRAC above) */ - if (prob_ewma > MINSTREL_FRAC(90, 100)) + if (prob_avg > MINSTREL_FRAC(90, 100)) return MINSTREL_TRUNC(100000 * ((MINSTREL_FRAC(90, 100) * 1000) / nsecs)); else - return MINSTREL_TRUNC(100000 * ((prob_ewma * 1000) / nsecs)); + return MINSTREL_TRUNC(100000 * ((prob_avg * 1000) / nsecs)); } /* @@ -385,13 +389,13 @@ minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u16 index, cur_group = index / MCS_GROUP_RATES; cur_idx = index % MCS_GROUP_RATES; - cur_prob = mi->groups[cur_group].rates[cur_idx].prob_ewma; + cur_prob = mi->groups[cur_group].rates[cur_idx].prob_avg; cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, cur_prob); do { tmp_group = tp_list[j - 1] / MCS_GROUP_RATES; tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); if (cur_tp_avg < tmp_tp_avg || @@ -428,7 +432,7 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) tmp_group = mi->max_prob_rate / MCS_GROUP_RATES; tmp_idx = mi->max_prob_rate % MCS_GROUP_RATES; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); /* if max_tp_rate[0] is from MCS_GROUP max_prob_rate get selected from @@ -440,11 +444,11 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES; max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; - max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma; + max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_avg; - if (mrs->prob_ewma > MINSTREL_FRAC(75, 100)) { + if (mrs->prob_avg > MINSTREL_FRAC(75, 100)) { cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, - mrs->prob_ewma); + mrs->prob_avg); if (cur_tp_avg > tmp_tp_avg) mi->max_prob_rate = index; @@ -454,9 +458,9 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) if (cur_tp_avg > max_gpr_tp_avg) mg->max_group_prob_rate = index; } else { - if (mrs->prob_ewma > tmp_prob) + if (mrs->prob_avg > tmp_prob) mi->max_prob_rate = index; - if (mrs->prob_ewma > max_gpr_prob) + if (mrs->prob_avg > max_gpr_prob) mg->max_group_prob_rate = index; } } @@ -478,15 +482,15 @@ minstrel_ht_assign_best_tp_rates(struct minstrel_ht_sta *mi, tmp_group = tmp_cck_tp_rate[0] / MCS_GROUP_RATES; tmp_idx = tmp_cck_tp_rate[0] % MCS_GROUP_RATES; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_cck_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); tmp_group = tmp_mcs_tp_rate[0] / MCS_GROUP_RATES; tmp_idx = tmp_mcs_tp_rate[0] % MCS_GROUP_RATES; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_mcs_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); - if (tmp_cck_tp > tmp_mcs_tp) { + if (tmp_cck_tp_rate && tmp_cck_tp > tmp_mcs_tp) { for(i = 0; i < MAX_THR_RATES; i++) { minstrel_ht_sort_best_tp_rates(mi, tmp_cck_tp_rate[i], tmp_mcs_tp_rate); @@ -514,7 +518,7 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) continue; tmp_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; - tmp_prob = mi->groups[group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[group].rates[tmp_idx].prob_avg; if (tmp_tp < minstrel_ht_get_tp_avg(mi, group, tmp_idx, tmp_prob) && (minstrel_mcs_groups[group].streams < tmp_max_streams)) { @@ -526,6 +530,133 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) } } +static inline int +minstrel_get_duration(int index) +{ + const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES]; + unsigned int duration = group->duration[index % MCS_GROUP_RATES]; + return duration << group->shift; +} + +static bool +minstrel_ht_probe_group(struct minstrel_ht_sta *mi, const struct mcs_group *tp_group, + int tp_idx, const struct mcs_group *group) +{ + if (group->bw < tp_group->bw) + return false; + + if (group->streams == tp_group->streams) + return true; + + if (tp_idx < 4 && group->streams == tp_group->streams - 1) + return true; + + return group->streams == tp_group->streams + 1; +} + +static void +minstrel_ht_find_probe_rates(struct minstrel_ht_sta *mi, u16 *rates, int *n_rates, + bool faster_rate) +{ + const struct mcs_group *group, *tp_group; + int i, g, max_dur; + int tp_idx; + + tp_group = &minstrel_mcs_groups[mi->max_tp_rate[0] / MCS_GROUP_RATES]; + tp_idx = mi->max_tp_rate[0] % MCS_GROUP_RATES; + + max_dur = minstrel_get_duration(mi->max_tp_rate[0]); + if (faster_rate) + max_dur -= max_dur / 16; + + for (g = 0; g < MINSTREL_GROUPS_NB; g++) { + u16 supported = mi->supported[g]; + + if (!supported) + continue; + + group = &minstrel_mcs_groups[g]; + if (!minstrel_ht_probe_group(mi, tp_group, tp_idx, group)) + continue; + + for (i = 0; supported; supported >>= 1, i++) { + int idx; + + if (!(supported & 1)) + continue; + + if ((group->duration[i] << group->shift) > max_dur) + continue; + + idx = g * MCS_GROUP_RATES + i; + if (idx == mi->max_tp_rate[0]) + continue; + + rates[(*n_rates)++] = idx; + break; + } + } +} + +static void +minstrel_ht_rate_sample_switch(struct minstrel_priv *mp, + struct minstrel_ht_sta *mi) +{ + struct minstrel_rate_stats *mrs; + u16 rates[MINSTREL_GROUPS_NB]; + int n_rates = 0; + int probe_rate = 0; + bool faster_rate; + int i; + u8 random; + + /* + * Use rate switching instead of probing packets for devices with + * little control over retry fallback behavior + */ + if (mp->hw->max_rates > 1) + return; + + /* + * If the current EWMA prob is >75%, look for a rate that's 6.25% + * faster than the max tp rate. + * If that fails, look again for a rate that is at least as fast + */ + mrs = minstrel_get_ratestats(mi, mi->max_tp_rate[0]); + faster_rate = mrs->prob_avg > MINSTREL_FRAC(75, 100); + minstrel_ht_find_probe_rates(mi, rates, &n_rates, faster_rate); + if (!n_rates && faster_rate) + minstrel_ht_find_probe_rates(mi, rates, &n_rates, false); + + /* If no suitable rate was found, try to pick the next one in the group */ + if (!n_rates) { + int g_idx = mi->max_tp_rate[0] / MCS_GROUP_RATES; + u16 supported = mi->supported[g_idx]; + + supported >>= mi->max_tp_rate[0] % MCS_GROUP_RATES; + for (i = 0; supported; supported >>= 1, i++) { + if (!(supported & 1)) + continue; + + probe_rate = mi->max_tp_rate[0] + i; + goto out; + } + + return; + } + + i = 0; + if (n_rates > 1) { + random = prandom_u32(); + i = random % n_rates; + } + probe_rate = rates[i]; + +out: + mi->sample_rate = probe_rate; + mi->sample_mode = MINSTREL_SAMPLE_ACTIVE; +} + /* * Update rate statistics and select new primary rates * @@ -536,7 +667,8 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) * higher throughput rates, even if the probablity is a bit lower */ static void -minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) +minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, + bool sample) { struct minstrel_mcs_group_data *mg; struct minstrel_rate_stats *mrs; @@ -544,6 +676,18 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) u16 tmp_mcs_tp_rate[MAX_THR_RATES], tmp_group_tp_rate[MAX_THR_RATES]; u16 tmp_cck_tp_rate[MAX_THR_RATES], index; + mi->sample_mode = MINSTREL_SAMPLE_IDLE; + + if (sample) { + mi->total_packets_cur = mi->total_packets - + mi->total_packets_last; + mi->total_packets_last = mi->total_packets; + } + if (!mp->sample_switch) + sample = false; + if (mi->total_packets_cur < SAMPLE_SWITCH_THR && mp->sample_switch != 1) + sample = false; + if (mi->ampdu_packets > 0) { if (!ieee80211_hw_check(mp->hw, TX_STATUS_NO_AMPDU_LEN)) mi->avg_ampdu_len = minstrel_ewma(mi->avg_ampdu_len, @@ -558,11 +702,19 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) mi->sample_slow = 0; mi->sample_count = 0; - /* Initialize global rate indexes */ - for(j = 0; j < MAX_THR_RATES; j++){ - tmp_mcs_tp_rate[j] = 0; - tmp_cck_tp_rate[j] = 0; - } + memset(tmp_mcs_tp_rate, 0, sizeof(tmp_mcs_tp_rate)); + memset(tmp_cck_tp_rate, 0, sizeof(tmp_cck_tp_rate)); + if (mi->supported[MINSTREL_CCK_GROUP]) + for (j = 0; j < ARRAY_SIZE(tmp_cck_tp_rate); j++) + tmp_cck_tp_rate[j] = MINSTREL_CCK_GROUP * MCS_GROUP_RATES; + + if (mi->supported[MINSTREL_VHT_GROUP_0]) + index = MINSTREL_VHT_GROUP_0 * MCS_GROUP_RATES; + else + index = MINSTREL_HT_GROUP_0 * MCS_GROUP_RATES; + + for (j = 0; j < ARRAY_SIZE(tmp_mcs_tp_rate); j++) + tmp_mcs_tp_rate[j] = index; /* Find best rate sets within all MCS groups*/ for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { @@ -575,7 +727,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) /* (re)Initialize group rate indexes */ for(j = 0; j < MAX_THR_RATES; j++) - tmp_group_tp_rate[j] = group; + tmp_group_tp_rate[j] = MCS_GROUP_RATES * group; for (i = 0; i < MCS_GROUP_RATES; i++) { if (!(mi->supported[group] & BIT(i))) @@ -585,8 +737,8 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) mrs = &mg->rates[i]; mrs->retry_updated = false; - minstrel_calc_rate_stats(mrs); - cur_prob = mrs->prob_ewma; + minstrel_calc_rate_stats(mp, mrs); + cur_prob = mrs->prob_avg; if (minstrel_ht_get_tp_avg(mi, group, i, cur_prob) == 0) continue; @@ -621,6 +773,11 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) /* try to sample all available rates during each interval */ mi->sample_count *= 8; + if (mp->new_avg) + mi->sample_count /= 2; + + if (sample) + minstrel_ht_rate_sample_switch(mp, mi); #ifdef CONFIG_MAC80211_DEBUGFS /* use fixed index if set */ @@ -628,6 +785,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) for (i = 0; i < 4; i++) mi->max_tp_rate[i] = mp->fixed_rate_idx; mi->max_prob_rate = mp->fixed_rate_idx; + mi->sample_mode = MINSTREL_SAMPLE_IDLE; } #endif @@ -731,15 +889,18 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, struct minstrel_ht_sta_priv *msp = priv_sta; struct minstrel_ht_sta *mi = &msp->ht; struct ieee80211_tx_rate *ar = info->status.rates; - struct minstrel_rate_stats *rate, *rate2; + struct minstrel_rate_stats *rate, *rate2, *rate_sample = NULL; struct minstrel_priv *mp = priv; + u32 update_interval = mp->update_interval / 2; bool last, update = false; + bool sample_status = false; int i; if (!msp->is_ht) return mac80211_minstrel.tx_status_ext(priv, sband, &msp->legacy, st); + /* This packet was aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) @@ -765,12 +926,17 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) mi->sample_packets += info->status.ampdu_len; + if (mi->sample_mode != MINSTREL_SAMPLE_IDLE) + rate_sample = minstrel_get_ratestats(mi, mi->sample_rate); + last = !minstrel_ht_txstat_valid(mp, &ar[0]); for (i = 0; !last; i++) { last = (i == IEEE80211_TX_MAX_RATES - 1) || !minstrel_ht_txstat_valid(mp, &ar[i + 1]); rate = minstrel_ht_get_stats(mp, mi, &ar[i]); + if (rate == rate_sample) + sample_status = true; if (last) rate->success += info->status.ampdu_ack_len; @@ -778,44 +944,61 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, rate->attempts += ar[i].count * info->status.ampdu_len; } - /* - * check for sudden death of spatial multiplexing, - * downgrade to a lower number of streams if necessary. - */ - rate = minstrel_get_ratestats(mi, mi->max_tp_rate[0]); - if (rate->attempts > 30 && - MINSTREL_FRAC(rate->success, rate->attempts) < - MINSTREL_FRAC(20, 100)) { - minstrel_downgrade_rate(mi, &mi->max_tp_rate[0], true); + switch (mi->sample_mode) { + case MINSTREL_SAMPLE_IDLE: + if (mp->new_avg && + (mp->hw->max_rates > 1 || + mi->total_packets_cur < SAMPLE_SWITCH_THR)) + update_interval /= 2; + break; + + case MINSTREL_SAMPLE_ACTIVE: + if (!sample_status) + break; + + mi->sample_mode = MINSTREL_SAMPLE_PENDING; update = true; - } + break; + + case MINSTREL_SAMPLE_PENDING: + if (sample_status) + break; - rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate[1]); - if (rate2->attempts > 30 && - MINSTREL_FRAC(rate2->success, rate2->attempts) < - MINSTREL_FRAC(20, 100)) { - minstrel_downgrade_rate(mi, &mi->max_tp_rate[1], false); update = true; + minstrel_ht_update_stats(mp, mi, false); + break; + } + + + if (mp->hw->max_rates > 1) { + /* + * check for sudden death of spatial multiplexing, + * downgrade to a lower number of streams if necessary. + */ + rate = minstrel_get_ratestats(mi, mi->max_tp_rate[0]); + if (rate->attempts > 30 && + rate->success < rate->attempts / 4) { + minstrel_downgrade_rate(mi, &mi->max_tp_rate[0], true); + update = true; + } + + rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate[1]); + if (rate2->attempts > 30 && + rate2->success < rate2->attempts / 4) { + minstrel_downgrade_rate(mi, &mi->max_tp_rate[1], false); + update = true; + } } - if (time_after(jiffies, mi->last_stats_update + - (mp->update_interval / 2 * HZ) / 1000)) { + if (time_after(jiffies, mi->last_stats_update + update_interval)) { update = true; - minstrel_ht_update_stats(mp, mi); + minstrel_ht_update_stats(mp, mi, true); } if (update) minstrel_ht_update_rates(mp, mi); } -static inline int -minstrel_get_duration(int index) -{ - const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES]; - unsigned int duration = group->duration[index % MCS_GROUP_RATES]; - return duration << group->shift; -} - static void minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, int index) @@ -829,7 +1012,7 @@ minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, unsigned int overhead = 0, overhead_rtscts = 0; mrs = minstrel_get_ratestats(mi, index); - if (mrs->prob_ewma < MINSTREL_FRAC(1, 10)) { + if (mrs->prob_avg < MINSTREL_FRAC(1, 10)) { mrs->retry_count = 1; mrs->retry_count_rtscts = 1; return; @@ -886,7 +1069,7 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, if (!mrs->retry_updated) minstrel_calc_retransmit(mp, mi, index); - if (mrs->prob_ewma < MINSTREL_FRAC(20, 100) || !mrs->retry_count) { + if (mrs->prob_avg < MINSTREL_FRAC(20, 100) || !mrs->retry_count) { ratetbl->rate[offset].count = 2; ratetbl->rate[offset].count_rts = 2; ratetbl->rate[offset].count_cts = 2; @@ -920,11 +1103,11 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, } static inline int -minstrel_ht_get_prob_ewma(struct minstrel_ht_sta *mi, int rate) +minstrel_ht_get_prob_avg(struct minstrel_ht_sta *mi, int rate) { int group = rate / MCS_GROUP_RATES; rate %= MCS_GROUP_RATES; - return mi->groups[group].rates[rate].prob_ewma; + return mi->groups[group].rates[rate].prob_avg; } static int @@ -936,7 +1119,7 @@ minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi) unsigned int duration; /* Disable A-MSDU if max_prob_rate is bad */ - if (mi->groups[group].rates[rate].prob_ewma < MINSTREL_FRAC(50, 100)) + if (mi->groups[group].rates[rate].prob_avg < MINSTREL_FRAC(50, 100)) return 1; duration = g->duration[rate]; @@ -959,7 +1142,7 @@ minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi) * data packet size */ if (duration > MCS_DURATION(1, 0, 260) || - (minstrel_ht_get_prob_ewma(mi, mi->max_tp_rate[0]) < + (minstrel_ht_get_prob_avg(mi, mi->max_tp_rate[0]) < MINSTREL_FRAC(75, 100))) return 3200; @@ -980,14 +1163,18 @@ static void minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { struct ieee80211_sta_rates *rates; + u16 first_rate = mi->max_tp_rate[0]; int i = 0; + if (mi->sample_mode == MINSTREL_SAMPLE_ACTIVE) + first_rate = mi->sample_rate; + rates = kzalloc(sizeof(*rates), GFP_ATOMIC); if (!rates) return; /* Start with max_tp_rate[0] */ - minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate[0]); + minstrel_ht_set_rate(mp, mi, rates, i++, first_rate); if (mp->hw->max_rates >= 3) { /* At least 3 tx rates supported, use max_tp_rate[1] next */ @@ -1012,6 +1199,11 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) int tp_rate1, tp_rate2; int sample_idx = 0; + if (mp->hw->max_rates == 1 && mp->sample_switch && + (mi->total_packets_cur >= SAMPLE_SWITCH_THR || + mp->sample_switch == 1)) + return -1; + if (mi->sample_wait > 0) { mi->sample_wait--; return -1; @@ -1055,10 +1247,25 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) * rate, to avoid wasting airtime. */ sample_dur = minstrel_get_duration(sample_idx); - if (mrs->prob_ewma > MINSTREL_FRAC(95, 100) || + if (mrs->prob_avg > MINSTREL_FRAC(95, 100) || minstrel_get_duration(mi->max_prob_rate) * 3 < sample_dur) return -1; + + /* + * For devices with no configurable multi-rate retry, skip sampling + * below the per-group max throughput rate, and only use one sampling + * attempt per rate + */ + if (mp->hw->max_rates == 1 && + (minstrel_get_duration(mg->max_group_tp_rate[0]) < sample_dur || + mrs->attempts)) + return -1; + + /* Skip already sampled slow rates */ + if (sample_dur >= minstrel_get_duration(tp_rate1) && mrs->attempts) + return -1; + /* * Make sure that lower rates get sampled only occasionally, * if the link is working perfectly. @@ -1318,7 +1525,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, mi->supported[MINSTREL_CCK_GROUP] |= mi->cck_supported_short << 4; /* create an initial rate table with the lowest supported rates */ - minstrel_ht_update_stats(mp, mi); + minstrel_ht_update_stats(mp, mi, true); minstrel_ht_update_rates(mp, mi); return; @@ -1436,6 +1643,8 @@ minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) if (!mp) return NULL; + mp->sample_switch = -1; + /* contention window settings * Just an approximation. Using the per-queue values would complicate * the calculations and is probably unnecessary */ @@ -1461,12 +1670,17 @@ minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) mp->has_mrr = true; mp->hw = hw; - mp->update_interval = 100; + mp->update_interval = HZ / 10; + mp->new_avg = true; #ifdef CONFIG_MAC80211_DEBUGFS mp->fixed_rate_idx = (u32) -1; debugfs_create_u32("fixed_rate_idx", S_IRUGO | S_IWUGO, debugfsdir, &mp->fixed_rate_idx); + debugfs_create_u32("sample_switch", S_IRUGO | S_IWUSR, debugfsdir, + &mp->sample_switch); + debugfs_create_bool("new_avg", S_IRUGO | S_IWUSR, debugfsdir, + &mp->new_avg); #endif minstrel_ht_init_cck_rates(mp); @@ -1491,7 +1705,7 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta) i = mi->max_tp_rate[0] / MCS_GROUP_RATES; j = mi->max_tp_rate[0] % MCS_GROUP_RATES; - prob = mi->groups[i].rates[j].prob_ewma; + prob = mi->groups[i].rates[j].prob_avg; /* convert tp_avg from pkt per second in kbps */ tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10; diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h index 80296268c778..53ea3c29debf 100644 --- a/net/mac80211/rc80211_minstrel_ht.h +++ b/net/mac80211/rc80211_minstrel_ht.h @@ -33,6 +33,7 @@ struct mcs_group { u16 flags; u8 streams; u8 shift; + u8 bw; u16 duration[MCS_GROUP_RATES]; }; @@ -50,6 +51,12 @@ struct minstrel_mcs_group_data { struct minstrel_rate_stats rates[MCS_GROUP_RATES]; }; +enum minstrel_sample_mode { + MINSTREL_SAMPLE_IDLE, + MINSTREL_SAMPLE_ACTIVE, + MINSTREL_SAMPLE_PENDING, +}; + struct minstrel_ht_sta { struct ieee80211_sta *sta; @@ -71,6 +78,8 @@ struct minstrel_ht_sta { unsigned int overhead; unsigned int overhead_rtscts; + unsigned int total_packets_last; + unsigned int total_packets_cur; unsigned int total_packets; unsigned int sample_packets; @@ -82,6 +91,9 @@ struct minstrel_ht_sta { u8 sample_count; u8 sample_slow; + enum minstrel_sample_mode sample_mode; + u16 sample_rate; + /* current MCS group to be sampled */ u8 sample_group; @@ -107,6 +119,6 @@ struct minstrel_ht_sta_priv { void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, - int prob_ewma); + int prob_avg); #endif diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c index 5a6e9f3edc04..bebb71917742 100644 --- a/net/mac80211/rc80211_minstrel_ht_debugfs.c +++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c @@ -98,8 +98,8 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) p += sprintf(p, "%6u ", tx_time); tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); - tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); - eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_avg); + eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u" " %3u %3u %-3u " @@ -243,8 +243,8 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) p += sprintf(p, "%u,", tx_time); tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); - tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); - eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_avg); + eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u,%u," "%u,%llu,%llu,", diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3c1ab870fefe..0e05ff037672 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2447,11 +2447,13 @@ static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb, skb->protocol == cpu_to_be16(ETH_P_PREAUTH)) && sdata->control_port_over_nl80211)) { struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - bool noencrypt = status->flag & RX_FLAG_DECRYPTED; + bool noencrypt = !(status->flag & RX_FLAG_DECRYPTED); cfg80211_rx_control_port(dev, skb, noencrypt); dev_kfree_skb(skb); } else { + memset(skb->cb, 0, sizeof(skb->cb)); + /* deliver to local stack */ if (rx->napi) napi_gro_receive(rx->napi, skb); @@ -2546,8 +2548,6 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) if (skb) { skb->protocol = eth_type_trans(skb, dev); - memset(skb->cb, 0, sizeof(skb->cb)); - ieee80211_deliver_skb_to_local_stack(skb, rx); } @@ -3467,9 +3467,18 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) case cpu_to_le16(IEEE80211_STYPE_PROBE_RESP): /* process for all: mesh, mlme, ibss */ break; + case cpu_to_le16(IEEE80211_STYPE_DEAUTH): + if (is_multicast_ether_addr(mgmt->da) && + !is_broadcast_ether_addr(mgmt->da)) + return RX_DROP_MONITOR; + + /* process only for station/IBSS */ + if (sdata->vif.type != NL80211_IFTYPE_STATION && + sdata->vif.type != NL80211_IFTYPE_ADHOC) + return RX_DROP_MONITOR; + break; case cpu_to_le16(IEEE80211_STYPE_ASSOC_RESP): case cpu_to_le16(IEEE80211_STYPE_REASSOC_RESP): - case cpu_to_le16(IEEE80211_STYPE_DEAUTH): case cpu_to_le16(IEEE80211_STYPE_DISASSOC): if (is_multicast_ether_addr(mgmt->da) && !is_broadcast_ether_addr(mgmt->da)) diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index adf94ba1ed77..4d31d9688dc2 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -520,10 +520,33 @@ static int ieee80211_start_sw_scan(struct ieee80211_local *local, return 0; } +static bool __ieee80211_can_leave_ch(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_sub_if_data *sdata_iter; + + if (!ieee80211_is_radar_required(local)) + return true; + + if (!regulatory_pre_cac_allowed(local->hw.wiphy)) + return false; + + mutex_lock(&local->iflist_mtx); + list_for_each_entry(sdata_iter, &local->interfaces, list) { + if (sdata_iter->wdev.cac_started) { + mutex_unlock(&local->iflist_mtx); + return false; + } + } + mutex_unlock(&local->iflist_mtx); + + return true; +} + static bool ieee80211_can_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { - if (ieee80211_is_radar_required(local)) + if (!__ieee80211_can_leave_ch(sdata)) return false; if (!list_empty(&local->roc_list)) @@ -630,7 +653,10 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, lockdep_assert_held(&local->mtx); - if (local->scan_req || ieee80211_is_radar_required(local)) + if (local->scan_req) + return -EBUSY; + + if (!__ieee80211_can_leave_ch(sdata)) return -EBUSY; if (!ieee80211_can_scan(local, sdata)) { diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 95eb8220e2e4..0f5f40678885 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -210,6 +210,20 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, return NULL; } +struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local, + const u8 *sta_addr, const u8 *vif_addr) +{ + struct rhlist_head *tmp; + struct sta_info *sta; + + for_each_sta_info(local, sta_addr, sta, tmp) { + if (ether_addr_equal(vif_addr, sta->sdata->vif.addr)) + return sta; + } + + return NULL; +} + struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, int idx) { @@ -396,6 +410,9 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, skb_queue_head_init(&sta->ps_tx_buf[i]); skb_queue_head_init(&sta->tx_filtered[i]); sta->airtime[i].deficit = sta->airtime_weight; + atomic_set(&sta->airtime[i].aql_tx_pending, 0); + sta->airtime[i].aql_limit_low = local->aql_txq_limit_low[i]; + sta->airtime[i].aql_limit_high = local->aql_txq_limit_high[i]; } for (i = 0; i < IEEE80211_NUM_TIDS; i++) @@ -1065,7 +1082,6 @@ static void __sta_info_destroy_part2(struct sta_info *sta) cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); - rate_control_remove_sta_debugfs(sta); ieee80211_sta_debugfs_remove(sta); cleanup_single_sta(sta); @@ -1894,6 +1910,44 @@ void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, } EXPORT_SYMBOL(ieee80211_sta_register_airtime); +void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, + struct sta_info *sta, u8 ac, + u16 tx_airtime, bool tx_completed) +{ + int tx_pending; + + if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) + return; + + if (!tx_completed) { + if (sta) + atomic_add(tx_airtime, + &sta->airtime[ac].aql_tx_pending); + + atomic_add(tx_airtime, &local->aql_total_pending_airtime); + return; + } + + if (sta) { + tx_pending = atomic_sub_return(tx_airtime, + &sta->airtime[ac].aql_tx_pending); + if (WARN_ONCE(tx_pending < 0, + "STA %pM AC %d txq pending airtime underflow: %u, %u", + sta->addr, ac, tx_pending, tx_airtime)) + atomic_cmpxchg(&sta->airtime[ac].aql_tx_pending, + tx_pending, 0); + } + + tx_pending = atomic_sub_return(tx_airtime, + &local->aql_total_pending_airtime); + if (WARN_ONCE(tx_pending < 0, + "Device %s AC %d pending airtime underflow: %u, %u", + wiphy_name(local->hw.wiphy), ac, tx_pending, + tx_airtime)) + atomic_cmpxchg(&local->aql_total_pending_airtime, + tx_pending, 0); +} + int sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state) { @@ -1962,6 +2016,7 @@ int sta_info_move_state(struct sta_info *sta, case IEEE80211_STA_ASSOC: if (sta->sta_state == IEEE80211_STA_AUTH) { set_bit(WLAN_STA_ASSOC, &sta->_flags); + sta->assoc_at = ktime_get_boottime_ns(); ieee80211_recalc_min_chandef(sta->sdata); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); @@ -1979,6 +2034,10 @@ int sta_info_move_state(struct sta_info *sta, ieee80211_check_fast_xmit(sta); ieee80211_check_fast_rx(sta); } + if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN || + sta->sdata->vif.type == NL80211_IFTYPE_AP) + cfg80211_send_layer2_update(sta->sdata->dev, + sta->sta.addr); break; default: break; @@ -2191,6 +2250,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, BIT_ULL(NL80211_STA_INFO_STA_FLAGS) | BIT_ULL(NL80211_STA_INFO_BSS_PARAM) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TIME) | + BIT_ULL(NL80211_STA_INFO_ASSOC_AT_BOOTTIME) | BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC); if (sdata->vif.type == NL80211_IFTYPE_STATION) { @@ -2199,6 +2259,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, } sinfo->connected_time = ktime_get_seconds() - sta->last_connected; + sinfo->assoc_at = sta->assoc_at; sinfo->inactive_time = jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta)); @@ -2451,7 +2512,8 @@ unsigned long ieee80211_sta_last_active(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta); - if (time_after(stats->last_rx, sta->status_stats.last_ack)) + if (!sta->status_stats.last_ack || + time_after(stats->last_rx, sta->status_stats.last_ack)) return stats->last_rx; return sta->status_stats.last_ack; } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 3260d4234920..c00e28585f9d 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -132,8 +132,15 @@ struct airtime_info { u64 rx_airtime; u64 tx_airtime; s64 deficit; + atomic_t aql_tx_pending; /* Estimated airtime for frames pending */ + u32 aql_limit_low; + u32 aql_limit_high; }; +void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, + struct sta_info *sta, u8 ac, + u16 tx_airtime, bool tx_completed); + struct sta_info; /** @@ -466,6 +473,7 @@ struct ieee80211_sta_rx_stats { * the station when it leaves powersave or polls for frames * @driver_buffered_tids: bitmap of TIDs the driver has data buffered on * @txq_buffered_tids: bitmap of TIDs that mac80211 has txq data buffered on + * @assoc_at: clock boottime (in ns) of last association * @last_connected: time (in seconds) when a station got connected * @last_seq_ctrl: last received seq/frag number from this STA (per TID * plus one for non-QoS frames) @@ -562,6 +570,7 @@ struct sta_info { unsigned long driver_buffered_tids; unsigned long txq_buffered_tids; + u64 assoc_at; long last_connected; /* Updated from RX path only, no locking requirements */ @@ -723,6 +732,10 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr); +/* user must hold sta_mtx or be in RCU critical section */ +struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local, + const u8 *sta_addr, const u8 *vif_addr); + #define for_each_sta_info(local, _addr, _sta, _tmp) \ rhl_for_each_entry_rcu(_sta, _tmp, \ sta_info_hash_lookup(local, _addr), hash_node) diff --git a/net/mac80211/status.c b/net/mac80211/status.c index a88e3bf17e9d..b720feaf9a74 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -254,14 +254,22 @@ static void ieee80211_set_bar_pending(struct sta_info *sta, u8 tid, u16 ssn) tid_tx->bar_pending = true; } -static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info) +static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info, + struct ieee80211_tx_status *status) { int len = sizeof(struct ieee80211_radiotap_header); /* IEEE80211_RADIOTAP_RATE rate */ - if (info->status.rates[0].idx >= 0 && - !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS | - IEEE80211_TX_RC_VHT_MCS))) + if (status && status->rate && !(status->rate->flags & + (RATE_INFO_FLAGS_MCS | + RATE_INFO_FLAGS_DMG | + RATE_INFO_FLAGS_EDMG | + RATE_INFO_FLAGS_VHT_MCS | + RATE_INFO_FLAGS_HE_MCS))) + len += 2; + else if (info->status.rates[0].idx >= 0 && + !(info->status.rates[0].flags & + (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))) len += 2; /* IEEE80211_RADIOTAP_TX_FLAGS */ @@ -272,7 +280,14 @@ static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info) /* IEEE80211_RADIOTAP_MCS * IEEE80211_RADIOTAP_VHT */ - if (info->status.rates[0].idx >= 0) { + if (status && status->rate) { + if (status->rate->flags & RATE_INFO_FLAGS_MCS) + len += 3; + else if (status->rate->flags & RATE_INFO_FLAGS_VHT_MCS) + len = ALIGN(len, 2) + 12; + else if (status->rate->flags & RATE_INFO_FLAGS_HE_MCS) + len = ALIGN(len, 2) + 12; + } else if (info->status.rates[0].idx >= 0) { if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS) len += 3; else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) @@ -286,12 +301,14 @@ static void ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, struct ieee80211_supported_band *sband, struct sk_buff *skb, int retry_count, - int rtap_len, int shift) + int rtap_len, int shift, + struct ieee80211_tx_status *status) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_radiotap_header *rthdr; unsigned char *pos; + u16 legacy_rate = 0; u16 txflags; rthdr = skb_push(skb, rtap_len); @@ -310,14 +327,23 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, */ /* IEEE80211_RADIOTAP_RATE */ - if (info->status.rates[0].idx >= 0 && - !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS | - IEEE80211_TX_RC_VHT_MCS))) { - u16 rate; + if (status && status->rate) { + if (!(status->rate->flags & (RATE_INFO_FLAGS_MCS | + RATE_INFO_FLAGS_DMG | + RATE_INFO_FLAGS_EDMG | + RATE_INFO_FLAGS_VHT_MCS | + RATE_INFO_FLAGS_HE_MCS))) + legacy_rate = status->rate->legacy; + } else if (info->status.rates[0].idx >= 0 && + !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS | + IEEE80211_TX_RC_VHT_MCS))) + legacy_rate = + sband->bitrates[info->status.rates[0].idx].bitrate; + + if (legacy_rate) { rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE); - rate = sband->bitrates[info->status.rates[0].idx].bitrate; - *pos = DIV_ROUND_UP(rate, 5 * (1 << shift)); + *pos = DIV_ROUND_UP(legacy_rate, 5 * (1 << shift)); /* padding for tx flags */ pos += 2; } @@ -341,7 +367,140 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, *pos = retry_count; pos++; - if (info->status.rates[0].idx < 0) + if (status && status->rate && + (status->rate->flags & RATE_INFO_FLAGS_MCS)) { + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_MCS); + pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS | + IEEE80211_RADIOTAP_MCS_HAVE_GI | + IEEE80211_RADIOTAP_MCS_HAVE_BW; + if (status->rate->flags & RATE_INFO_FLAGS_SHORT_GI) + pos[1] |= IEEE80211_RADIOTAP_MCS_SGI; + if (status->rate->bw == RATE_INFO_BW_40) + pos[1] |= IEEE80211_RADIOTAP_MCS_BW_40; + pos[2] = status->rate->mcs; + pos += 3; + } else if (status && status->rate && + (status->rate->flags & RATE_INFO_FLAGS_VHT_MCS)) { + u16 known = local->hw.radiotap_vht_details & + (IEEE80211_RADIOTAP_VHT_KNOWN_GI | + IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH); + + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_VHT); + + /* required alignment from rthdr */ + pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); + + /* u16 known - IEEE80211_RADIOTAP_VHT_KNOWN_* */ + put_unaligned_le16(known, pos); + pos += 2; + + /* u8 flags - IEEE80211_RADIOTAP_VHT_FLAG_* */ + if (status->rate->flags & RATE_INFO_FLAGS_SHORT_GI) + *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; + pos++; + + /* u8 bandwidth */ + switch (status->rate->bw) { + case RATE_INFO_BW_160: + *pos = 11; + break; + case RATE_INFO_BW_80: + *pos = 4; + break; + case RATE_INFO_BW_40: + *pos = 1; + break; + default: + *pos = 0; + break; + } + pos++; + + /* u8 mcs_nss[4] */ + *pos = (status->rate->mcs << 4) | status->rate->nss; + pos += 4; + + /* u8 coding */ + pos++; + /* u8 group_id */ + pos++; + /* u16 partial_aid */ + pos += 2; + } else if (status && status->rate && + (status->rate->flags & RATE_INFO_FLAGS_HE_MCS)) { + struct ieee80211_radiotap_he *he; + + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE); + + /* required alignment from rthdr */ + pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); + he = (struct ieee80211_radiotap_he *)pos; + + he->data1 = cpu_to_le16(IEEE80211_RADIOTAP_HE_DATA1_FORMAT_SU | + IEEE80211_RADIOTAP_HE_DATA1_DATA_MCS_KNOWN | + IEEE80211_RADIOTAP_HE_DATA1_DATA_DCM_KNOWN | + IEEE80211_RADIOTAP_HE_DATA1_BW_RU_ALLOC_KNOWN); + + he->data2 = cpu_to_le16(IEEE80211_RADIOTAP_HE_DATA2_GI_KNOWN); + +#define HE_PREP(f, val) le16_encode_bits(val, IEEE80211_RADIOTAP_HE_##f) + + he->data6 |= HE_PREP(DATA6_NSTS, status->rate->nss); + +#define CHECK_GI(s) \ + BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_GI_##s != \ + (int)NL80211_RATE_INFO_HE_GI_##s) + + CHECK_GI(0_8); + CHECK_GI(1_6); + CHECK_GI(3_2); + + he->data3 |= HE_PREP(DATA3_DATA_MCS, status->rate->mcs); + he->data3 |= HE_PREP(DATA3_DATA_DCM, status->rate->he_dcm); + + he->data5 |= HE_PREP(DATA5_GI, status->rate->he_gi); + + switch (status->rate->bw) { + case RATE_INFO_BW_20: + he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_20MHZ); + break; + case RATE_INFO_BW_40: + he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_40MHZ); + break; + case RATE_INFO_BW_80: + he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_80MHZ); + break; + case RATE_INFO_BW_160: + he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_160MHZ); + break; + case RATE_INFO_BW_HE_RU: +#define CHECK_RU_ALLOC(s) \ + BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_##s##T != \ + NL80211_RATE_INFO_HE_RU_ALLOC_##s + 4) + + CHECK_RU_ALLOC(26); + CHECK_RU_ALLOC(52); + CHECK_RU_ALLOC(106); + CHECK_RU_ALLOC(242); + CHECK_RU_ALLOC(484); + CHECK_RU_ALLOC(996); + CHECK_RU_ALLOC(2x996); + + he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + status->rate->he_ru_alloc + 4); + break; + default: + WARN_ONCE(1, "Invalid SU BW %d\n", status->rate->bw); + } + + pos += sizeof(struct ieee80211_radiotap_he); + } + + if ((status && status->rate) || info->status.rates[0].idx < 0) return; /* IEEE80211_RADIOTAP_MCS @@ -511,12 +670,26 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, struct sk_buff *skb, bool dropped) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + u16 tx_time_est = ieee80211_info_get_tx_time_est(info); struct ieee80211_hdr *hdr = (void *)skb->data; bool acked = info->flags & IEEE80211_TX_STAT_ACK; if (dropped) acked = false; + if (tx_time_est) { + struct sta_info *sta; + + rcu_read_lock(); + + sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); + ieee80211_sta_update_pending_airtime(local, sta, + skb_get_queue_mapping(skb), + tx_time_est, + true); + rcu_read_unlock(); + } + if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) { struct ieee80211_sub_if_data *sdata; @@ -645,7 +818,8 @@ static int ieee80211_tx_get_rates(struct ieee80211_hw *hw, void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, struct ieee80211_supported_band *sband, - int retry_count, int shift, bool send_to_cooked) + int retry_count, int shift, bool send_to_cooked, + struct ieee80211_tx_status *status) { struct sk_buff *skb2; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); @@ -654,14 +828,14 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, int rtap_len; /* send frame to monitor interfaces now */ - rtap_len = ieee80211_tx_radiotap_len(info); + rtap_len = ieee80211_tx_radiotap_len(info, status); if (WARN_ON_ONCE(skb_headroom(skb) < rtap_len)) { pr_err("ieee80211_tx_status: headroom too small\n"); dev_kfree_skb(skb); return; } ieee80211_add_tx_radiotap_header(local, sband, skb, retry_count, - rtap_len, shift); + rtap_len, shift, status); /* XXX: is this sufficient for BPF? */ skb_reset_mac_header(skb); @@ -717,6 +891,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, struct ieee80211_bar *bar; int shift = 0; int tid = IEEE80211_NUM_TIDS; + u16 tx_time_est; rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); @@ -826,6 +1001,17 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, ieee80211_sta_register_airtime(&sta->sta, tid, info->status.tx_time, 0); + if ((tx_time_est = ieee80211_info_get_tx_time_est(info)) > 0) { + /* Do this here to avoid the expensive lookup of the sta + * in ieee80211_report_used_skb(). + */ + ieee80211_sta_update_pending_airtime(local, sta, + skb_get_queue_mapping(skb), + tx_time_est, + true); + ieee80211_info_set_tx_time_est(info, 0); + } + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { if (info->flags & IEEE80211_TX_STAT_ACK) { if (sta->status_stats.lost_packets) @@ -870,7 +1056,8 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, I802_DEBUG_INC(local->dot11FailedCount); } - if (ieee80211_is_nullfunc(fc) && ieee80211_has_pm(fc) && + if ((ieee80211_is_nullfunc(fc) || ieee80211_is_qos_nullfunc(fc)) && + ieee80211_has_pm(fc) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && local->ps_sdata && !(local->scanning)) { @@ -901,7 +1088,8 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, } /* send to monitor interfaces */ - ieee80211_tx_monitor(local, skb, sband, retry_count, shift, send_to_cooked); + ieee80211_tx_monitor(local, skb, sband, retry_count, shift, + send_to_cooked, status); } void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) @@ -912,19 +1100,13 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) .skb = skb, .info = IEEE80211_SKB_CB(skb), }; - struct rhlist_head *tmp; struct sta_info *sta; rcu_read_lock(); - for_each_sta_info(local, hdr->addr1, sta, tmp) { - /* skip wrong virtual interface */ - if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr)) - continue; - + sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); + if (sta) status.sta = &sta->sta; - break; - } __ieee80211_tx_status(hw, &status); rcu_read_unlock(); diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c index 727dc9f3f3b3..e7f57bb18f6e 100644 --- a/net/mac80211/tkip.c +++ b/net/mac80211/tkip.c @@ -263,9 +263,21 @@ int ieee80211_tkip_decrypt_data(struct arc4_ctx *ctx, if ((keyid >> 6) != key->conf.keyidx) return TKIP_DECRYPT_INVALID_KEYIDX; - if (rx_ctx->ctx.state != TKIP_STATE_NOT_INIT && - (iv32 < rx_ctx->iv32 || - (iv32 == rx_ctx->iv32 && iv16 <= rx_ctx->iv16))) + /* Reject replays if the received TSC is smaller than or equal to the + * last received value in a valid message, but with an exception for + * the case where a new key has been set and no valid frame using that + * key has yet received and the local RSC was initialized to 0. This + * exception allows the very first frame sent by the transmitter to be + * accepted even if that transmitter were to use TSC 0 (IEEE 802.11 + * described TSC to be initialized to 1 whenever a new key is taken into + * use). + */ + if (iv32 < rx_ctx->iv32 || + (iv32 == rx_ctx->iv32 && + (iv16 < rx_ctx->iv16 || + (iv16 == rx_ctx->iv16 && + (rx_ctx->iv32 || rx_ctx->iv16 || + rx_ctx->ctx.state != TKIP_STATE_NOT_INIT))))) return TKIP_DECRYPT_REPLAY; if (only_iv) { diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 3bb4459b52c7..427f51a0a994 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -408,20 +408,20 @@ TRACE_EVENT(drv_bss_info_changed, __field(u32, basic_rates) __array(int, mcast_rate, NUM_NL80211_BANDS) __field(u16, ht_operation_mode) - __field(s32, cqm_rssi_thold); - __field(s32, cqm_rssi_hyst); - __field(u32, channel_width); - __field(u32, channel_cfreq1); + __field(s32, cqm_rssi_thold) + __field(s32, cqm_rssi_hyst) + __field(u32, channel_width) + __field(u32, channel_cfreq1) __dynamic_array(u32, arp_addr_list, info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ? IEEE80211_BSS_ARP_ADDR_LIST_LEN : - info->arp_addr_cnt); - __field(int, arp_addr_cnt); - __field(bool, qos); - __field(bool, idle); - __field(bool, ps); - __dynamic_array(u8, ssid, info->ssid_len); - __field(bool, hidden_ssid); + info->arp_addr_cnt) + __field(int, arp_addr_cnt) + __field(bool, qos) + __field(bool, idle) + __field(bool, ps) + __dynamic_array(u8, ssid, info->ssid_len) + __field(bool, hidden_ssid) __field(int, txpower) __field(u8, p2p_oppps_ctwindow) ), @@ -1242,9 +1242,10 @@ TRACE_EVENT(drv_remain_on_channel, ) ); -DEFINE_EVENT(local_only_evt, drv_cancel_remain_on_channel, - TP_PROTO(struct ieee80211_local *local), - TP_ARGS(local) +DEFINE_EVENT(local_sdata_evt, drv_cancel_remain_on_channel, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata), + TP_ARGS(local, sdata) ); TRACE_EVENT(drv_set_ringparam, @@ -1671,8 +1672,8 @@ TRACE_EVENT(drv_start_ap, VIF_ENTRY __field(u8, dtimper) __field(u16, bcnint) - __dynamic_array(u8, ssid, info->ssid_len); - __field(bool, hidden_ssid); + __dynamic_array(u8, ssid, info->ssid_len) + __field(bool, hidden_ssid) ), TP_fast_assign( @@ -1738,7 +1739,7 @@ TRACE_EVENT(drv_join_ibss, VIF_ENTRY __field(u8, dtimper) __field(u16, bcnint) - __dynamic_array(u8, ssid, info->ssid_len); + __dynamic_array(u8, ssid, info->ssid_len) ), TP_fast_assign( diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index f13eb2f61ccf..87def9cb91ff 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -162,6 +162,7 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, break; } case NL80211_BAND_5GHZ: + case NL80211_BAND_6GHZ: if (r->flags & IEEE80211_RATE_MANDATORY_A) mrate = r->bitrate; break; @@ -1616,7 +1617,7 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local, static bool ieee80211_tx_frags(struct ieee80211_local *local, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct sta_info *sta, struct sk_buff_head *skbs, bool txpending) { @@ -1678,7 +1679,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); info->control.vif = vif; - control.sta = sta; + control.sta = sta ? &sta->sta : NULL; __skb_unlink(skb, skbs); drv_tx(local, &control, skb); @@ -1697,7 +1698,6 @@ static bool __ieee80211_tx(struct ieee80211_local *local, struct ieee80211_tx_info *info; struct ieee80211_sub_if_data *sdata; struct ieee80211_vif *vif; - struct ieee80211_sta *pubsta; struct sk_buff *skb; bool result = true; __le16 fc; @@ -1712,11 +1712,6 @@ static bool __ieee80211_tx(struct ieee80211_local *local, if (sta && !sta->uploaded) sta = NULL; - if (sta) - pubsta = &sta->sta; - else - pubsta = NULL; - switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { @@ -1743,8 +1738,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local, break; } - result = ieee80211_tx_frags(local, vif, pubsta, skbs, - txpending); + result = ieee80211_tx_frags(local, vif, sta, skbs, txpending); ieee80211_tpt_led_trig_tx(local, fc, led_len); @@ -2262,6 +2256,15 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, payload[7]); } + /* + * Initialize skb->priority for QoS frames. This is put in the TID field + * of the frame before passing it to the driver. + */ + if (ieee80211_is_data_qos(hdr->frame_control)) { + u8 *p = ieee80211_get_qos_ctl(hdr); + skb->priority = *p & IEEE80211_QOS_CTL_TAG1D_MASK; + } + memset(info, 0, sizeof(*info)); info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS | @@ -2276,6 +2279,9 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, * isn't always enough to find the interface to use; for proper * VLAN/WDS support we will need a different mechanism (which * likely isn't going to be monitor interfaces). + * + * This is necessary, for example, for old hostapd versions that + * don't use nl80211-based management TX/RX. */ sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -2423,6 +2429,33 @@ static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, return 0; } +static int ieee80211_store_ack_skb(struct ieee80211_local *local, + struct sk_buff *skb, + u32 *info_flags) +{ + struct sk_buff *ack_skb = skb_clone_sk(skb); + u16 info_id = 0; + + if (ack_skb) { + unsigned long flags; + int id; + + spin_lock_irqsave(&local->ack_status_lock, flags); + id = idr_alloc(&local->ack_status_frames, ack_skb, + 1, 0x2000, GFP_ATOMIC); + spin_unlock_irqrestore(&local->ack_status_lock, flags); + + if (id >= 0) { + info_id = id; + *info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + } else { + kfree_skb(ack_skb); + } + } + + return info_id; +} + /** * ieee80211_build_hdr - build 802.11 header in the given frame * @sdata: virtual interface to build the header for @@ -2716,26 +2749,8 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, } if (unlikely(!multicast && skb->sk && - skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) { - struct sk_buff *ack_skb = skb_clone_sk(skb); - - if (ack_skb) { - unsigned long flags; - int id; - - spin_lock_irqsave(&local->ack_status_lock, flags); - id = idr_alloc(&local->ack_status_frames, ack_skb, - 1, 0x10000, GFP_ATOMIC); - spin_unlock_irqrestore(&local->ack_status_lock, flags); - - if (id >= 0) { - info_id = id; - info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; - } else { - kfree_skb(ack_skb); - } - } - } + skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) + info_id = ieee80211_store_ack_skb(local, skb, &info_flags); /* * If the skb is shared we need to obtain our own copy. @@ -3528,7 +3543,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, struct ieee80211_sub_if_data, u.ap); __skb_queue_tail(&tx.skbs, skb); - ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false); + ieee80211_tx_frags(local, &sdata->vif, sta, &tx.skbs, false); return true; } @@ -3546,6 +3561,11 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, ieee80211_tx_result r; struct ieee80211_vif *vif = txq->vif; + WARN_ON_ONCE(softirq_count() == 0); + + if (!ieee80211_txq_airtime_check(hw, txq)) + return NULL; + begin: spin_lock_bh(&fq->lock); @@ -3656,6 +3676,21 @@ begin: } IEEE80211_SKB_CB(skb)->control.vif = vif; + + if (wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) { + u32 airtime; + + airtime = ieee80211_calc_expected_tx_airtime(hw, vif, txq->sta, + skb->len); + if (airtime) { + airtime = ieee80211_info_set_tx_time_est(info, airtime); + ieee80211_sta_update_pending_airtime(local, tx.sta, + txq->ac, + airtime, + false); + } + } + return skb; out: @@ -3669,7 +3704,8 @@ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_txq *ret = NULL; - struct txq_info *txqi = NULL; + struct txq_info *txqi = NULL, *head = NULL; + bool found_eligible_txq = false; spin_lock_bh(&local->active_txq_lock[ac]); @@ -3680,13 +3716,30 @@ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac) if (!txqi) goto out; + if (txqi == head) { + if (!found_eligible_txq) + goto out; + else + found_eligible_txq = false; + } + + if (!head) + head = txqi; + if (txqi->txq.sta) { struct sta_info *sta = container_of(txqi->txq.sta, - struct sta_info, sta); + struct sta_info, sta); + bool aql_check = ieee80211_txq_airtime_check(hw, &txqi->txq); + s64 deficit = sta->airtime[txqi->txq.ac].deficit; - if (sta->airtime[txqi->txq.ac].deficit < 0) { + if (aql_check) + found_eligible_txq = true; + + if (deficit < 0) sta->airtime[txqi->txq.ac].deficit += sta->airtime_weight; + + if (deficit < 0 || !aql_check) { list_move_tail(&txqi->schedule_order, &local->active_txqs[txqi->txq.ac]); goto begin; @@ -3740,6 +3793,33 @@ void __ieee80211_schedule_txq(struct ieee80211_hw *hw, } EXPORT_SYMBOL(__ieee80211_schedule_txq); +bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, + struct ieee80211_txq *txq) +{ + struct sta_info *sta; + struct ieee80211_local *local = hw_to_local(hw); + + if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) + return true; + + if (!txq->sta) + return true; + + sta = container_of(txq->sta, struct sta_info, sta); + if (atomic_read(&sta->airtime[txq->ac].aql_tx_pending) < + sta->airtime[txq->ac].aql_limit_low) + return true; + + if (atomic_read(&local->aql_total_pending_airtime) < + local->aql_threshold && + atomic_read(&sta->airtime[txq->ac].aql_tx_pending) < + sta->airtime[txq->ac].aql_limit_high) + return true; + + return false; +} +EXPORT_SYMBOL(ieee80211_txq_airtime_check); + bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { @@ -3869,18 +3949,15 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, } } - next = skb; - while (next) { - skb = next; - next = skb->next; - - skb->prev = NULL; - skb->next = NULL; + skb_list_walk_safe(skb, skb, next) { + skb_mark_not_on_list(skb); skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, ctrl_flags); - if (IS_ERR(skb)) + if (IS_ERR(skb)) { + kfree_skb_list(next); goto out; + } ieee80211_tx_stats(dev, skb->len); @@ -4647,7 +4724,8 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, if (!sband) return bcn; - ieee80211_tx_monitor(hw_to_local(hw), copy, sband, 1, shift, false); + ieee80211_tx_monitor(hw_to_local(hw), copy, sband, 1, shift, false, + NULL); return bcn; } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index ad1e58184c4e..decd46b38393 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -247,7 +247,8 @@ static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) struct sta_info *sta; int i; - spin_lock_bh(&fq->lock); + local_bh_disable(); + spin_lock(&fq->lock); if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->bss->ps; @@ -273,9 +274,9 @@ static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) &txqi->flags)) continue; - spin_unlock_bh(&fq->lock); + spin_unlock(&fq->lock); drv_wake_tx_queue(local, txqi); - spin_lock_bh(&fq->lock); + spin_lock(&fq->lock); } } @@ -288,12 +289,14 @@ static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) (ps && atomic_read(&ps->num_sta_ps)) || ac != vif->txq->ac) goto out; - spin_unlock_bh(&fq->lock); + spin_unlock(&fq->lock); drv_wake_tx_queue(local, txqi); + local_bh_enable(); return; out: - spin_unlock_bh(&fq->lock); + spin_unlock(&fq->lock); + local_bh_enable(); } static void @@ -1060,16 +1063,22 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, elem_parse_failed = true; break; case WLAN_EID_VHT_OPERATION: - if (elen >= sizeof(struct ieee80211_vht_operation)) + if (elen >= sizeof(struct ieee80211_vht_operation)) { elems->vht_operation = (void *)pos; - else - elem_parse_failed = true; + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); + break; + } + elem_parse_failed = true; break; case WLAN_EID_OPMODE_NOTIF: - if (elen > 0) + if (elen > 0) { elems->opmode_notif = pos; - else - elem_parse_failed = true; + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); + break; + } + elem_parse_failed = true; break; case WLAN_EID_MESH_ID: elems->mesh_id = pos; @@ -1200,6 +1209,13 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, elems->cisco_dtpc_elem = pos; break; + case WLAN_EID_ADDBA_EXT: + if (elen != sizeof(struct ieee80211_addba_ext_ie)) { + elem_parse_failed = true; + break; + } + elems->addba_ext_ie = (void *)pos; + break; case WLAN_EID_TIMEOUT_INTERVAL: if (elen >= sizeof(struct ieee80211_timeout_interval_ie)) elems->timeout_int = (void *)pos; @@ -1233,6 +1249,10 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION && elen == 3) { elems->mbssid_config_ie = (void *)&pos[1]; + } else if (pos[0] == WLAN_EID_EXT_HE_SPR && + elen >= sizeof(*elems->he_spr) && + elen >= ieee80211_he_spr_size(&pos[1])) { + elems->he_spr = (void *)&pos[1]; } break; default: @@ -1572,7 +1592,8 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, } void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, - const u8 *bssid, u16 stype, u16 reason, + const u8 *da, const u8 *bssid, + u16 stype, u16 reason, bool send_frame, u8 *frame_buf) { struct ieee80211_local *local = sdata->local; @@ -1583,7 +1604,7 @@ void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype); mgmt->duration = 0; /* initialize only */ mgmt->seq_ctrl = 0; /* initialize only */ - memcpy(mgmt->da, bssid, ETH_ALEN); + memcpy(mgmt->da, da, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, bssid, ETH_ALEN); /* u.deauth.reason_code == u.disassoc.reason_code */ @@ -2409,11 +2430,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) /* add back keys */ list_for_each_entry(sdata, &local->interfaces, list) - ieee80211_reset_crypto_tx_tailroom(sdata); - - list_for_each_entry(sdata, &local->interfaces, list) - if (ieee80211_sdata_running(sdata)) - ieee80211_enable_keys(sdata); + ieee80211_reenable_keys(sdata); /* Reconfigure sched scan if it was interrupted by FW restart */ mutex_lock(&local->mtx); @@ -2702,6 +2719,27 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, return pos; } +u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata, u8 iftype) +{ + const struct ieee80211_sta_he_cap *he_cap; + struct ieee80211_supported_band *sband; + u8 n; + + sband = ieee80211_get_sband(sdata); + if (!sband) + return 0; + + he_cap = ieee80211_get_he_iftype_cap(sband, iftype); + if (!he_cap) + return 0; + + n = ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem); + return 2 + 1 + + sizeof(he_cap->he_cap_elem) + n + + ieee80211_he_ppe_size(he_cap->ppe_thres[0], + he_cap->he_cap_elem.phy_cap_info); +} + u8 *ieee80211_ie_build_he_cap(u8 *pos, const struct ieee80211_sta_he_cap *he_cap, u8 *end) @@ -2891,6 +2929,34 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, return pos + sizeof(struct ieee80211_vht_operation); } +u8 *ieee80211_ie_build_he_oper(u8 *pos) +{ + struct ieee80211_he_operation *he_oper; + u32 he_oper_params; + + *pos++ = WLAN_EID_EXTENSION; + *pos++ = 1 + sizeof(struct ieee80211_he_operation); + *pos++ = WLAN_EID_EXT_HE_OPERATION; + + he_oper_params = 0; + he_oper_params |= u32_encode_bits(1023, /* disabled */ + IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); + he_oper_params |= u32_encode_bits(1, + IEEE80211_HE_OPERATION_ER_SU_DISABLE); + he_oper_params |= u32_encode_bits(1, + IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED); + + he_oper = (struct ieee80211_he_operation *)pos; + he_oper->he_oper_params = cpu_to_le32(he_oper_params); + + /* don't require special HE peer rates */ + he_oper->he_mcs_nss_set = cpu_to_le16(0xffff); + + /* TODO add VHT operational and 6GHz operational subelement? */ + + return pos + sizeof(struct ieee80211_vht_operation); +} + bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef) { @@ -2927,10 +2993,22 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, int cf0, cf1; int ccfs0, ccfs1, ccfs2; int ccf0, ccf1; + u32 vht_cap; + bool support_80_80 = false; + bool support_160 = false; if (!oper || !htop) return false; + vht_cap = hw->wiphy->bands[chandef->chan->band]->vht_cap.cap; + support_160 = (vht_cap & (IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK | + IEEE80211_VHT_CAP_EXT_NSS_BW_MASK)); + support_80_80 = ((vht_cap & + IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) || + (vht_cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && + vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) || + ((vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) >> + IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT > 1)); ccfs0 = oper->center_freq_seg0_idx; ccfs1 = oper->center_freq_seg1_idx; ccfs2 = (le16_to_cpu(htop->operation_mode) & @@ -2958,10 +3036,10 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, unsigned int diff; diff = abs(ccf1 - ccf0); - if (diff == 8) { + if ((diff == 8) && support_160) { new.width = NL80211_CHAN_WIDTH_160; new.center_freq1 = cf1; - } else if (diff > 8) { + } else if ((diff > 8) && support_80_80) { new.width = NL80211_CHAN_WIDTH_80P80; new.center_freq2 = cf1; } diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index b20ff28d9f30..ccdcb9ad9ac7 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -4,7 +4,7 @@ * * Portions of this file * Copyright(c) 2015 - 2016 Intel Deutschland GmbH - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018 - 2019 Intel Corporation */ #include <linux/ieee80211.h> @@ -349,6 +349,14 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta) cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return IEEE80211_STA_RX_BW_160; + /* + * If this is non-zero, then it does support 160 MHz after all, + * in one form or the other. We don't distinguish here (or even + * above) between 160 and 80+80 yet. + */ + if (vht_cap->cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) + return IEEE80211_STA_RX_BW_160; + return IEEE80211_STA_RX_BW_80; } diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index ee72779729e5..91bf32af55e9 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -946,7 +946,8 @@ ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx) info = IEEE80211_SKB_CB(skb); - if (info->control.hw_key) + if (info->control.hw_key && + !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE)) return TX_CONTINUE; if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) @@ -962,6 +963,9 @@ ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx) bip_ipn_set64(mmie->sequence_number, pn64); + if (info->control.hw_key) + return TX_CONTINUE; + bip_aad(skb, aad); /* diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index c312741df2ce..4701edffb1f7 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -617,16 +617,15 @@ static struct net_device *inet6_fib_lookup_dev(struct net *net, struct net_device *dev; struct dst_entry *dst; struct flowi6 fl6; - int err; if (!ipv6_stub) return ERR_PTR(-EAFNOSUPPORT); memset(&fl6, 0, sizeof(fl6)); memcpy(&fl6.daddr, addr, sizeof(struct in6_addr)); - err = ipv6_stub->ipv6_dst_lookup(net, NULL, &dst, &fl6); - if (err) - return ERR_PTR(err); + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); + if (IS_ERR(dst)) + return ERR_CAST(dst); dev = dst->dev; dev_hold(dev); diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index d25e91d7bdc1..44b675016393 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -133,12 +133,12 @@ static int mpls_xmit(struct sk_buff *skb) mpls_stats_inc_outucastpkts(out_dev, skb); if (rt) { - if (rt->rt_gw_family == AF_INET) - err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gw4, - skb); - else if (rt->rt_gw_family == AF_INET6) + if (rt->rt_gw_family == AF_INET6) err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt->rt_gw6, skb); + else + err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gw4, + skb); } else if (rt6) { if (ipv6_addr_v4mapped(&rt6->rt6i_gateway)) { /* 6PE (RFC 4798) */ diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig new file mode 100644 index 000000000000..49f6054e7f4e --- /dev/null +++ b/net/mptcp/Kconfig @@ -0,0 +1,28 @@ + +config MPTCP + bool "MPTCP: Multipath TCP" + depends on INET + select SKB_EXTENSIONS + select CRYPTO_LIB_SHA256 + help + Multipath TCP (MPTCP) connections send and receive data over multiple + subflows in order to utilize multiple network paths. Each subflow + uses the TCP protocol, and TCP options carry header information for + MPTCP. + +if MPTCP + +config MPTCP_IPV6 + bool "MPTCP: IPv6 support for Multipath TCP" + select IPV6 + default y + +config MPTCP_HMAC_TEST + bool "Tests for MPTCP HMAC implementation" + help + This option enable boot time self-test for the HMAC implementation + used by the MPTCP code + + Say N if you are unsure. + +endif diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile new file mode 100644 index 000000000000..4e98d9edfd0a --- /dev/null +++ b/net/mptcp/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_MPTCP) += mptcp.o + +mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c new file mode 100644 index 000000000000..40d1bb18fd60 --- /dev/null +++ b/net/mptcp/crypto.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP cryptographic functions + * Copyright (c) 2017 - 2019, Intel Corporation. + * + * Note: This code is based on mptcp_ctrl.c, mptcp_ipv4.c, and + * mptcp_ipv6 from multipath-tcp.org, authored by: + * + * SĂ©bastien BarrĂ© <sebastien.barre@uclouvain.be> + * Christoph Paasch <christoph.paasch@uclouvain.be> + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> + * Gregory Detal <gregory.detal@uclouvain.be> + * Fabien DuchĂŞne <fabien.duchene@uclouvain.be> + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> + * Lavkesh Lahngir <lavkesh51@gmail.com> + * Andreas Ripke <ripke@neclab.eu> + * Vlad Dogaru <vlad.dogaru@intel.com> + * Octavian Purdila <octavian.purdila@intel.com> + * John Ronan <jronan@tssg.org> + * Catalin Nicutar <catalin.nicutar@gmail.com> + * Brandon Heller <brandonh@stanford.edu> + */ + +#include <linux/kernel.h> +#include <crypto/sha.h> +#include <asm/unaligned.h> + +#include "protocol.h" + +#define SHA256_DIGEST_WORDS (SHA256_DIGEST_SIZE / 4) + +void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn) +{ + __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS]; + __be64 input = cpu_to_be64(key); + struct sha256_state state; + + sha256_init(&state); + sha256_update(&state, (__force u8 *)&input, sizeof(input)); + sha256_final(&state, (u8 *)mptcp_hashed_key); + + if (token) + *token = be32_to_cpu(mptcp_hashed_key[0]); + if (idsn) + *idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[6])); +} + +void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, + void *hmac) +{ + u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE]; + __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS]; + __be32 *hash_out = (__force __be32 *)hmac; + struct sha256_state state; + u8 key1be[8]; + u8 key2be[8]; + int i; + + put_unaligned_be64(key1, key1be); + put_unaligned_be64(key2, key2be); + + /* Generate key xored with ipad */ + memset(input, 0x36, SHA_MESSAGE_BYTES); + for (i = 0; i < 8; i++) + input[i] ^= key1be[i]; + for (i = 0; i < 8; i++) + input[i + 8] ^= key2be[i]; + + put_unaligned_be32(nonce1, &input[SHA256_BLOCK_SIZE]); + put_unaligned_be32(nonce2, &input[SHA256_BLOCK_SIZE + 4]); + + sha256_init(&state); + sha256_update(&state, input, SHA256_BLOCK_SIZE + 8); + + /* emit sha256(K1 || msg) on the second input block, so we can + * reuse 'input' for the last hashing + */ + sha256_final(&state, &input[SHA256_BLOCK_SIZE]); + + /* Prepare second part of hmac */ + memset(input, 0x5C, SHA_MESSAGE_BYTES); + for (i = 0; i < 8; i++) + input[i] ^= key1be[i]; + for (i = 0; i < 8; i++) + input[i + 8] ^= key2be[i]; + + sha256_init(&state); + sha256_update(&state, input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE); + sha256_final(&state, (u8 *)mptcp_hashed_key); + + /* takes only first 160 bits */ + for (i = 0; i < 5; i++) + hash_out[i] = mptcp_hashed_key[i]; +} + +#ifdef CONFIG_MPTCP_HMAC_TEST +struct test_cast { + char *key; + char *msg; + char *result; +}; + +/* we can't reuse RFC 4231 test vectors, as we have constraint on the + * input and key size, and we truncate the output. + */ +static struct test_cast tests[] = { + { + .key = "0b0b0b0b0b0b0b0b", + .msg = "48692054", + .result = "8385e24fb4235ac37556b6b886db106284a1da67", + }, + { + .key = "aaaaaaaaaaaaaaaa", + .msg = "dddddddd", + .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492", + }, + { + .key = "0102030405060708", + .msg = "cdcdcdcd", + .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6", + }, +}; + +static int __init test_mptcp_crypto(void) +{ + char hmac[20], hmac_hex[41]; + u32 nonce1, nonce2; + u64 key1, key2; + int i, j; + + for (i = 0; i < ARRAY_SIZE(tests); ++i) { + /* mptcp hmap will convert to be before computing the hmac */ + key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0])); + key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8])); + nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0])); + nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4])); + + mptcp_crypto_hmac_sha(key1, key2, nonce1, nonce2, hmac); + for (j = 0; j < 20; ++j) + sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); + hmac_hex[40] = 0; + + if (memcmp(hmac_hex, tests[i].result, 40)) + pr_err("test %d failed, got %s expected %s", i, + hmac_hex, tests[i].result); + else + pr_info("test %d [ ok ]", i); + } + return 0; +} + +late_initcall(test_mptcp_crypto); +#endif diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c new file mode 100644 index 000000000000..8e39585d37f3 --- /dev/null +++ b/net/mptcp/ctrl.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2019, Tessares SA. + */ + +#include <linux/sysctl.h> + +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +#include "protocol.h" + +#define MPTCP_SYSCTL_PATH "net/mptcp" + +static int mptcp_pernet_id; +struct mptcp_pernet { + struct ctl_table_header *ctl_table_hdr; + + int mptcp_enabled; +}; + +static struct mptcp_pernet *mptcp_get_pernet(struct net *net) +{ + return net_generic(net, mptcp_pernet_id); +} + +int mptcp_is_enabled(struct net *net) +{ + return mptcp_get_pernet(net)->mptcp_enabled; +} + +static struct ctl_table mptcp_sysctl_table[] = { + { + .procname = "enabled", + .maxlen = sizeof(int), + .mode = 0644, + /* users with CAP_NET_ADMIN or root (not and) can change this + * value, same as other sysctl or the 'net' tree. + */ + .proc_handler = proc_dointvec, + }, + {} +}; + +static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) +{ + pernet->mptcp_enabled = 1; +} + +static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) +{ + struct ctl_table_header *hdr; + struct ctl_table *table; + + table = mptcp_sysctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(mptcp_sysctl_table), GFP_KERNEL); + if (!table) + goto err_alloc; + } + + table[0].data = &pernet->mptcp_enabled; + + hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); + if (!hdr) + goto err_reg; + + pernet->ctl_table_hdr = hdr; + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) +{ + struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg; + + unregister_net_sysctl_table(pernet->ctl_table_hdr); + + kfree(table); +} + +static int __net_init mptcp_net_init(struct net *net) +{ + struct mptcp_pernet *pernet = mptcp_get_pernet(net); + + mptcp_pernet_set_defaults(pernet); + + return mptcp_pernet_new_table(net, pernet); +} + +/* Note: the callback will only be called per extra netns */ +static void __net_exit mptcp_net_exit(struct net *net) +{ + struct mptcp_pernet *pernet = mptcp_get_pernet(net); + + mptcp_pernet_del_table(pernet); +} + +static struct pernet_operations mptcp_pernet_ops = { + .init = mptcp_net_init, + .exit = mptcp_net_exit, + .id = &mptcp_pernet_id, + .size = sizeof(struct mptcp_pernet), +}; + +void __init mptcp_init(void) +{ + mptcp_proto_init(); + + if (register_pernet_subsys(&mptcp_pernet_ops) < 0) + panic("Failed to register MPTCP pernet subsystem.\n"); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +int __init mptcpv6_init(void) +{ + int err; + + err = mptcp_proto_v6_init(); + + return err; +} +#endif diff --git a/net/mptcp/options.c b/net/mptcp/options.c new file mode 100644 index 000000000000..45acd877bef3 --- /dev/null +++ b/net/mptcp/options.c @@ -0,0 +1,586 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#include <linux/kernel.h> +#include <net/tcp.h> +#include <net/mptcp.h> +#include "protocol.h" + +static bool mptcp_cap_flag_sha256(u8 flags) +{ + return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; +} + +void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, + int opsize, struct tcp_options_received *opt_rx) +{ + struct mptcp_options_received *mp_opt = &opt_rx->mptcp; + u8 subtype = *ptr >> 4; + int expected_opsize; + u8 version; + u8 flags; + + switch (subtype) { + case MPTCPOPT_MP_CAPABLE: + /* strict size checking */ + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + if (skb->len > tcp_hdr(skb)->doff << 2) + expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA; + else + expected_opsize = TCPOLEN_MPTCP_MPC_ACK; + } else { + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) + expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK; + else + expected_opsize = TCPOLEN_MPTCP_MPC_SYN; + } + if (opsize != expected_opsize) + break; + + /* try to be gentle vs future versions on the initial syn */ + version = *ptr++ & MPTCP_VERSION_MASK; + if (opsize != TCPOLEN_MPTCP_MPC_SYN) { + if (version != MPTCP_SUPPORTED_VERSION) + break; + } else if (version < MPTCP_SUPPORTED_VERSION) { + break; + } + + flags = *ptr++; + if (!mptcp_cap_flag_sha256(flags) || + (flags & MPTCP_CAP_EXTENSIBILITY)) + break; + + /* RFC 6824, Section 3.1: + * "For the Checksum Required bit (labeled "A"), if either + * host requires the use of checksums, checksums MUST be used. + * In other words, the only way for checksums not to be used + * is if both hosts in their SYNs set A=0." + * + * Section 3.3.0: + * "If a checksum is not present when its use has been + * negotiated, the receiver MUST close the subflow with a RST as + * it is considered broken." + * + * We don't implement DSS checksum - fall back to TCP. + */ + if (flags & MPTCP_CAP_CHECKSUM_REQD) + break; + + mp_opt->mp_capable = 1; + if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { + mp_opt->sndr_key = get_unaligned_be64(ptr); + ptr += 8; + } + if (opsize >= TCPOLEN_MPTCP_MPC_ACK) { + mp_opt->rcvr_key = get_unaligned_be64(ptr); + ptr += 8; + } + if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) { + /* Section 3.1.: + * "the data parameters in a MP_CAPABLE are semantically + * equivalent to those in a DSS option and can be used + * interchangeably." + */ + mp_opt->dss = 1; + mp_opt->use_map = 1; + mp_opt->mpc_map = 1; + mp_opt->data_len = get_unaligned_be16(ptr); + ptr += 2; + } + pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d", + version, flags, opsize, mp_opt->sndr_key, + mp_opt->rcvr_key, mp_opt->data_len); + break; + + case MPTCPOPT_DSS: + pr_debug("DSS"); + ptr++; + + /* we must clear 'mpc_map' be able to detect MP_CAPABLE + * map vs DSS map in mptcp_incoming_options(), and reconstruct + * map info accordingly + */ + mp_opt->mpc_map = 0; + flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; + mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; + mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; + mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0; + mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0; + mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK); + + pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d", + mp_opt->data_fin, mp_opt->dsn64, + mp_opt->use_map, mp_opt->ack64, + mp_opt->use_ack); + + expected_opsize = TCPOLEN_MPTCP_DSS_BASE; + + if (mp_opt->use_ack) { + if (mp_opt->ack64) + expected_opsize += TCPOLEN_MPTCP_DSS_ACK64; + else + expected_opsize += TCPOLEN_MPTCP_DSS_ACK32; + } + + if (mp_opt->use_map) { + if (mp_opt->dsn64) + expected_opsize += TCPOLEN_MPTCP_DSS_MAP64; + else + expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; + } + + /* RFC 6824, Section 3.3: + * If a checksum is present, but its use had + * not been negotiated in the MP_CAPABLE handshake, + * the checksum field MUST be ignored. + */ + if (opsize != expected_opsize && + opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) + break; + + mp_opt->dss = 1; + + if (mp_opt->use_ack) { + if (mp_opt->ack64) { + mp_opt->data_ack = get_unaligned_be64(ptr); + ptr += 8; + } else { + mp_opt->data_ack = get_unaligned_be32(ptr); + ptr += 4; + } + + pr_debug("data_ack=%llu", mp_opt->data_ack); + } + + if (mp_opt->use_map) { + if (mp_opt->dsn64) { + mp_opt->data_seq = get_unaligned_be64(ptr); + ptr += 8; + } else { + mp_opt->data_seq = get_unaligned_be32(ptr); + ptr += 4; + } + + mp_opt->subflow_seq = get_unaligned_be32(ptr); + ptr += 4; + + mp_opt->data_len = get_unaligned_be16(ptr); + ptr += 2; + + pr_debug("data_seq=%llu subflow_seq=%u data_len=%u", + mp_opt->data_seq, mp_opt->subflow_seq, + mp_opt->data_len); + } + + break; + + default: + break; + } +} + +void mptcp_get_options(const struct sk_buff *skb, + struct tcp_options_received *opt_rx) +{ + const unsigned char *ptr; + const struct tcphdr *th = tcp_hdr(skb); + int length = (th->doff * 4) - sizeof(struct tcphdr); + + ptr = (const unsigned char *)(th + 1); + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + return; /* don't parse partial options */ + if (opcode == TCPOPT_MPTCP) + mptcp_parse_option(skb, ptr, opsize, opt_rx); + ptr += opsize - 2; + length -= opsize; + } + } +} + +bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, + unsigned int *size, struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + /* we will use snd_isn to detect first pkt [re]transmission + * in mptcp_established_options_mp() + */ + subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; + if (subflow->request_mptcp) { + pr_debug("local_key=%llu", subflow->local_key); + opts->suboptions = OPTION_MPTCP_MPC_SYN; + opts->sndr_key = subflow->local_key; + *size = TCPOLEN_MPTCP_MPC_SYN; + return true; + } + return false; +} + +void mptcp_rcv_synsent(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct tcp_sock *tp = tcp_sk(sk); + + pr_debug("subflow=%p", subflow); + if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) { + subflow->mp_capable = 1; + subflow->can_ack = 1; + subflow->remote_key = tp->rx_opt.mptcp.sndr_key; + } else { + tcp_sk(sk)->is_mptcp = 0; + } +} + +static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_ext *mpext; + unsigned int data_len; + + pr_debug("subflow=%p fourth_ack=%d seq=%x:%x remaining=%d", subflow, + subflow->fourth_ack, subflow->snd_isn, + skb ? TCP_SKB_CB(skb)->seq : 0, remaining); + + if (subflow->mp_capable && !subflow->fourth_ack && skb && + subflow->snd_isn == TCP_SKB_CB(skb)->seq) { + /* When skb is not available, we better over-estimate the + * emitted options len. A full DSS option is longer than + * TCPOLEN_MPTCP_MPC_ACK_DATA, so let's the caller try to fit + * that. + */ + mpext = mptcp_get_ext(skb); + data_len = mpext ? mpext->data_len : 0; + + /* we will check ext_copy.data_len in mptcp_write_options() to + * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and + * TCPOLEN_MPTCP_MPC_ACK + */ + opts->ext_copy.data_len = data_len; + opts->suboptions = OPTION_MPTCP_MPC_ACK; + opts->sndr_key = subflow->local_key; + opts->rcvr_key = subflow->remote_key; + + /* Section 3.1. + * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK + * packets that start the first subflow of an MPTCP connection, + * as well as the first packet that carries data + */ + if (data_len > 0) + *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4); + else + *size = TCPOLEN_MPTCP_MPC_ACK; + + pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d", + subflow, subflow->local_key, subflow->remote_key, + data_len); + + return true; + } + return false; +} + +static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, + struct mptcp_ext *ext) +{ + ext->data_fin = 1; + + if (!ext->use_map) { + /* RFC6824 requires a DSS mapping with specific values + * if DATA_FIN is set but no data payload is mapped + */ + ext->use_map = 1; + ext->dsn64 = 1; + ext->data_seq = mptcp_sk(subflow->conn)->write_seq; + ext->subflow_seq = 0; + ext->data_len = 1; + } else { + /* If there's an existing DSS mapping, DATA_FIN consumes + * 1 additional byte of mapping space. + */ + ext->data_len++; + } +} + +static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + unsigned int dss_size = 0; + struct mptcp_ext *mpext; + struct mptcp_sock *msk; + unsigned int ack_size; + bool ret = false; + u8 tcp_fin; + + if (skb) { + mpext = mptcp_get_ext(skb); + tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; + } else { + mpext = NULL; + tcp_fin = 0; + } + + if (!skb || (mpext && mpext->use_map) || tcp_fin) { + unsigned int map_size; + + map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; + + remaining -= map_size; + dss_size = map_size; + if (mpext) + opts->ext_copy = *mpext; + + if (skb && tcp_fin && + subflow->conn->sk_state != TCP_ESTABLISHED) + mptcp_write_data_fin(subflow, &opts->ext_copy); + ret = true; + } + + opts->ext_copy.use_ack = 0; + msk = mptcp_sk(subflow->conn); + if (!msk || !READ_ONCE(msk->can_ack)) { + *size = ALIGN(dss_size, 4); + return ret; + } + + ack_size = TCPOLEN_MPTCP_DSS_ACK64; + + /* Add kind/length/subtype/flag overhead if mapping is not populated */ + if (dss_size == 0) + ack_size += TCPOLEN_MPTCP_DSS_BASE; + + dss_size += ack_size; + + opts->ext_copy.data_ack = msk->ack_seq; + opts->ext_copy.ack64 = 1; + opts->ext_copy.use_ack = 1; + + *size = ALIGN(dss_size, 4); + return true; +} + +bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, + unsigned int *size, unsigned int remaining, + struct mptcp_out_options *opts) +{ + unsigned int opt_size = 0; + bool ret = false; + + if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) + ret = true; + else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, + opts)) + ret = true; + + /* we reserved enough space for the above options, and exceeding the + * TCP option space would be fatal + */ + if (WARN_ON_ONCE(opt_size > remaining)) + return false; + + *size += opt_size; + remaining -= opt_size; + + return ret; +} + +bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + + if (subflow_req->mp_capable) { + opts->suboptions = OPTION_MPTCP_MPC_SYNACK; + opts->sndr_key = subflow_req->local_key; + *size = TCPOLEN_MPTCP_MPC_SYNACK; + pr_debug("subflow_req=%p, local_key=%llu", + subflow_req, subflow_req->local_key); + return true; + } + return false; +} + +static bool check_fourth_ack(struct mptcp_subflow_context *subflow, + struct sk_buff *skb, + struct mptcp_options_received *mp_opt) +{ + /* here we can process OoO, in-window pkts, only in-sequence 4th ack + * are relevant + */ + if (likely(subflow->fourth_ack || + TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)) + return true; + + if (mp_opt->use_ack) + subflow->fourth_ack = 1; + + if (subflow->can_ack) + return true; + + /* If the first established packet does not contain MP_CAPABLE + data + * then fallback to TCP + */ + if (!mp_opt->mp_capable) { + subflow->mp_capable = 0; + tcp_sk(mptcp_subflow_tcp_sock(subflow))->is_mptcp = 0; + return false; + } + subflow->remote_key = mp_opt->sndr_key; + subflow->can_ack = 1; + return true; +} + +void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, + struct tcp_options_received *opt_rx) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_options_received *mp_opt; + struct mptcp_ext *mpext; + + mp_opt = &opt_rx->mptcp; + if (!check_fourth_ack(subflow, skb, mp_opt)) + return; + + if (!mp_opt->dss) + return; + + mpext = skb_ext_add(skb, SKB_EXT_MPTCP); + if (!mpext) + return; + + memset(mpext, 0, sizeof(*mpext)); + + if (mp_opt->use_map) { + if (mp_opt->mpc_map) { + /* this is an MP_CAPABLE carrying MPTCP data + * we know this map the first chunk of data + */ + mptcp_crypto_key_sha(subflow->remote_key, NULL, + &mpext->data_seq); + mpext->data_seq++; + mpext->subflow_seq = 1; + mpext->dsn64 = 1; + mpext->mpc_map = 1; + } else { + mpext->data_seq = mp_opt->data_seq; + mpext->subflow_seq = mp_opt->subflow_seq; + mpext->dsn64 = mp_opt->dsn64; + } + mpext->data_len = mp_opt->data_len; + mpext->use_map = 1; + } + + if (mp_opt->use_ack) { + mpext->data_ack = mp_opt->data_ack; + mpext->use_ack = 1; + mpext->ack64 = mp_opt->ack64; + } + + mpext->data_fin = mp_opt->data_fin; +} + +void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) +{ + if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | + OPTION_MPTCP_MPC_ACK) & opts->suboptions) { + u8 len; + + if (OPTION_MPTCP_MPC_SYN & opts->suboptions) + len = TCPOLEN_MPTCP_MPC_SYN; + else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) + len = TCPOLEN_MPTCP_MPC_SYNACK; + else if (opts->ext_copy.data_len) + len = TCPOLEN_MPTCP_MPC_ACK_DATA; + else + len = TCPOLEN_MPTCP_MPC_ACK; + + *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) | + (MPTCPOPT_MP_CAPABLE << 12) | + (MPTCP_SUPPORTED_VERSION << 8) | + MPTCP_CAP_HMAC_SHA256); + + if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & + opts->suboptions)) + goto mp_capable_done; + + put_unaligned_be64(opts->sndr_key, ptr); + ptr += 2; + if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions)) + goto mp_capable_done; + + put_unaligned_be64(opts->rcvr_key, ptr); + ptr += 2; + if (!opts->ext_copy.data_len) + goto mp_capable_done; + + put_unaligned_be32(opts->ext_copy.data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + ptr += 1; + } + +mp_capable_done: + if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { + struct mptcp_ext *mpext = &opts->ext_copy; + u8 len = TCPOLEN_MPTCP_DSS_BASE; + u8 flags = 0; + + if (mpext->use_ack) { + len += TCPOLEN_MPTCP_DSS_ACK64; + flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64; + } + + if (mpext->use_map) { + len += TCPOLEN_MPTCP_DSS_MAP64; + + /* Use only 64-bit mapping flags for now, add + * support for optional 32-bit mappings later. + */ + flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; + if (mpext->data_fin) + flags |= MPTCP_DSS_DATA_FIN; + } + + *ptr++ = htonl((TCPOPT_MPTCP << 24) | + (len << 16) | + (MPTCPOPT_DSS << 12) | + (flags)); + + if (mpext->use_ack) { + put_unaligned_be64(mpext->data_ack, ptr); + ptr += 2; + } + + if (mpext->use_map) { + put_unaligned_be64(mpext->data_seq, ptr); + ptr += 2; + put_unaligned_be32(mpext->subflow_seq, ptr); + ptr += 1; + put_unaligned_be32(mpext->data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + } + } +} diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c new file mode 100644 index 000000000000..030dee668e0a --- /dev/null +++ b/net/mptcp/protocol.c @@ -0,0 +1,1252 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/sched/signal.h> +#include <linux/atomic.h> +#include <net/sock.h> +#include <net/inet_common.h> +#include <net/inet_hashtables.h> +#include <net/protocol.h> +#include <net/tcp.h> +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +#include <net/transp_v6.h> +#endif +#include <net/mptcp.h> +#include "protocol.h" + +#define MPTCP_SAME_STATE TCP_MAX_STATES + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +struct mptcp6_sock { + struct mptcp_sock msk; + struct ipv6_pinfo np; +}; +#endif + +/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not + * completed yet or has failed, return the subflow socket. + * Otherwise return NULL. + */ +static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) +{ + if (!msk->subflow || READ_ONCE(msk->can_ack)) + return NULL; + + return msk->subflow; +} + +static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk) +{ + return msk->first && !sk_is_mptcp(msk->first); +} + +static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk) +{ + sock_owned_by_me((const struct sock *)msk); + + if (likely(!__mptcp_needs_tcp_fallback(msk))) + return NULL; + + if (msk->subflow) { + release_sock((struct sock *)msk); + return msk->subflow; + } + + return NULL; +} + +static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk) +{ + return !msk->first; +} + +static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + struct socket *ssock; + int err; + + ssock = __mptcp_nmpc_socket(msk); + if (ssock) + goto set_state; + + if (!__mptcp_can_create_subflow(msk)) + return ERR_PTR(-EINVAL); + + err = mptcp_subflow_create_socket(sk, &ssock); + if (err) + return ERR_PTR(err); + + msk->first = ssock->sk; + msk->subflow = ssock; + subflow = mptcp_subflow_ctx(ssock->sk); + list_add(&subflow->node, &msk->conn_list); + subflow->request_mptcp = 1; + +set_state: + if (state != MPTCP_SAME_STATE) + inet_sk_state_store(sk, state); + return ssock; +} + +static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + sock_owned_by_me((const struct sock *)msk); + + mptcp_for_each_subflow(msk, subflow) { + return mptcp_subflow_tcp_sock(subflow); + } + + return NULL; +} + +static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) +{ + if (!msk->cached_ext) + msk->cached_ext = __skb_ext_alloc(); + + return !!msk->cached_ext; +} + +static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + + sock_owned_by_me(sk); + + mptcp_for_each_subflow(msk, subflow) { + if (subflow->data_avail) + return mptcp_subflow_tcp_sock(subflow); + } + + return NULL; +} + +static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk, + const struct sk_buff *skb, + const struct mptcp_ext *mpext) +{ + if (!tcp_skb_can_collapse_to(skb)) + return false; + + /* can collapse only if MPTCP level sequence is in order */ + return mpext && mpext->data_seq + mpext->data_len == msk->write_seq; +} + +static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, + struct msghdr *msg, long *timeo, int *pmss_now, + int *ps_goal) +{ + int mss_now, avail_size, size_goal, ret; + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_ext *mpext = NULL; + struct sk_buff *skb, *tail; + bool can_collapse = false; + struct page_frag *pfrag; + size_t psize; + + /* use the mptcp page cache so that we can easily move the data + * from one substream to another, but do per subflow memory accounting + */ + pfrag = sk_page_frag(sk); + while (!sk_page_frag_refill(ssk, pfrag) || + !mptcp_ext_cache_refill(msk)) { + ret = sk_stream_wait_memory(ssk, timeo); + if (ret) + return ret; + if (unlikely(__mptcp_needs_tcp_fallback(msk))) + return 0; + } + + /* compute copy limit */ + mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags); + *pmss_now = mss_now; + *ps_goal = size_goal; + avail_size = size_goal; + skb = tcp_write_queue_tail(ssk); + if (skb) { + mpext = skb_ext_find(skb, SKB_EXT_MPTCP); + + /* Limit the write to the size available in the + * current skb, if any, so that we create at most a new skb. + * Explicitly tells TCP internals to avoid collapsing on later + * queue management operation, to avoid breaking the ext <-> + * SSN association set here + */ + can_collapse = (size_goal - skb->len > 0) && + mptcp_skb_can_collapse_to(msk, skb, mpext); + if (!can_collapse) + TCP_SKB_CB(skb)->eor = 1; + else + avail_size = size_goal - skb->len; + } + psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size); + + /* Copy to page */ + pr_debug("left=%zu", msg_data_left(msg)); + psize = copy_page_from_iter(pfrag->page, pfrag->offset, + min_t(size_t, msg_data_left(msg), psize), + &msg->msg_iter); + pr_debug("left=%zu", msg_data_left(msg)); + if (!psize) + return -EINVAL; + + /* tell the TCP stack to delay the push so that we can safely + * access the skb after the sendpages call + */ + ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize, + msg->msg_flags | MSG_SENDPAGE_NOTLAST); + if (ret <= 0) + return ret; + if (unlikely(ret < psize)) + iov_iter_revert(&msg->msg_iter, psize - ret); + + /* if the tail skb extension is still the cached one, collapsing + * really happened. Note: we can't check for 'same skb' as the sk_buff + * hdr on tail can be transmitted, freed and re-allocated by the + * do_tcp_sendpages() call + */ + tail = tcp_write_queue_tail(ssk); + if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) { + WARN_ON_ONCE(!can_collapse); + mpext->data_len += ret; + goto out; + } + + skb = tcp_write_queue_tail(ssk); + mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext); + msk->cached_ext = NULL; + + memset(mpext, 0, sizeof(*mpext)); + mpext->data_seq = msk->write_seq; + mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; + mpext->data_len = ret; + mpext->use_map = 1; + mpext->dsn64 = 1; + + pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d", + mpext->data_seq, mpext->subflow_seq, mpext->data_len, + mpext->dsn64); + +out: + pfrag->offset += ret; + msk->write_seq += ret; + mptcp_subflow_ctx(ssk)->rel_write_seq += ret; + + return ret; +} + +static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk) +{ + struct socket *sock; + + if (likely(sk_stream_is_writeable(ssk))) + return; + + sock = READ_ONCE(ssk->sk_socket); + + if (sock) { + clear_bit(MPTCP_SEND_SPACE, &msk->flags); + smp_mb__after_atomic(); + /* set NOSPACE only after clearing SEND_SPACE flag */ + set_bit(SOCK_NOSPACE, &sock->flags); + } +} + +static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) +{ + int mss_now = 0, size_goal = 0, ret = 0; + struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *ssock; + size_t copied = 0; + struct sock *ssk; + long timeo; + + if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) + return -EOPNOTSUPP; + + lock_sock(sk); + ssock = __mptcp_tcp_fallback(msk); + if (unlikely(ssock)) { +fallback: + pr_debug("fallback passthrough"); + ret = sock_sendmsg(ssock, msg); + return ret >= 0 ? ret + copied : (copied ? copied : ret); + } + + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + ssk = mptcp_subflow_get(msk); + if (!ssk) { + release_sock(sk); + return -ENOTCONN; + } + + pr_debug("conn_list->subflow=%p", ssk); + + lock_sock(ssk); + while (msg_data_left(msg)) { + ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now, + &size_goal); + if (ret < 0) + break; + if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) { + release_sock(ssk); + ssock = __mptcp_tcp_fallback(msk); + goto fallback; + } + + copied += ret; + } + + if (copied) { + ret = copied; + tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, + size_goal); + } + + ssk_check_wmem(msk, ssk); + release_sock(ssk); + release_sock(sk); + return ret; +} + +int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct mptcp_read_arg *arg = desc->arg.data; + size_t copy_len; + + copy_len = min(desc->count, len); + + if (likely(arg->msg)) { + int err; + + err = skb_copy_datagram_msg(skb, offset, arg->msg, copy_len); + if (err) { + pr_debug("error path"); + desc->error = err; + return err; + } + } else { + pr_debug("Flushing skb payload"); + } + + desc->count -= copy_len; + + pr_debug("consumed %zu bytes, %zu left", copy_len, desc->count); + return copy_len; +} + +static void mptcp_wait_data(struct sock *sk, long *timeo) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct mptcp_sock *msk = mptcp_sk(sk); + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + + sk_wait_event(sk, timeo, + test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait); + + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); +} + +static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_subflow_context *subflow; + bool more_data_avail = false; + struct mptcp_read_arg arg; + read_descriptor_t desc; + bool wait_data = false; + struct socket *ssock; + struct tcp_sock *tp; + bool done = false; + struct sock *ssk; + int copied = 0; + int target; + long timeo; + + if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT)) + return -EOPNOTSUPP; + + lock_sock(sk); + ssock = __mptcp_tcp_fallback(msk); + if (unlikely(ssock)) { +fallback: + pr_debug("fallback-read subflow=%p", + mptcp_subflow_ctx(ssock->sk)); + copied = sock_recvmsg(ssock, msg, flags); + return copied; + } + + arg.msg = msg; + desc.arg.data = &arg; + desc.error = 0; + + timeo = sock_rcvtimeo(sk, nonblock); + + len = min_t(size_t, len, INT_MAX); + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + + while (!done) { + u32 map_remaining; + int bytes_read; + + ssk = mptcp_subflow_recv_lookup(msk); + pr_debug("msk=%p ssk=%p", msk, ssk); + if (!ssk) + goto wait_for_data; + + subflow = mptcp_subflow_ctx(ssk); + tp = tcp_sk(ssk); + + lock_sock(ssk); + do { + /* try to read as much data as available */ + map_remaining = subflow->map_data_len - + mptcp_subflow_get_map_offset(subflow); + desc.count = min_t(size_t, len - copied, map_remaining); + pr_debug("reading %zu bytes, copied %d", desc.count, + copied); + bytes_read = tcp_read_sock(ssk, &desc, + mptcp_read_actor); + if (bytes_read < 0) { + if (!copied) + copied = bytes_read; + done = true; + goto next; + } + + pr_debug("msk ack_seq=%llx -> %llx", msk->ack_seq, + msk->ack_seq + bytes_read); + msk->ack_seq += bytes_read; + copied += bytes_read; + if (copied >= len) { + done = true; + goto next; + } + if (tp->urg_data && tp->urg_seq == tp->copied_seq) { + pr_err("Urgent data present, cannot proceed"); + done = true; + goto next; + } +next: + more_data_avail = mptcp_subflow_data_available(ssk); + } while (more_data_avail && !done); + release_sock(ssk); + continue; + +wait_for_data: + more_data_avail = false; + + /* only the master socket status is relevant here. The exit + * conditions mirror closely tcp_recvmsg() + */ + if (copied >= target) + break; + + if (copied) { + if (sk->sk_err || + sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + !timeo || + signal_pending(current)) + break; + } else { + if (sk->sk_err) { + copied = sock_error(sk); + break; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + if (sk->sk_state == TCP_CLOSE) { + copied = -ENOTCONN; + break; + } + + if (!timeo) { + copied = -EAGAIN; + break; + } + + if (signal_pending(current)) { + copied = sock_intr_errno(timeo); + break; + } + } + + pr_debug("block timeout %ld", timeo); + wait_data = true; + mptcp_wait_data(sk, &timeo); + if (unlikely(__mptcp_tcp_fallback(msk))) + goto fallback; + } + + if (more_data_avail) { + if (!test_bit(MPTCP_DATA_READY, &msk->flags)) + set_bit(MPTCP_DATA_READY, &msk->flags); + } else if (!wait_data) { + clear_bit(MPTCP_DATA_READY, &msk->flags); + + /* .. race-breaker: ssk might get new data after last + * data_available() returns false. + */ + ssk = mptcp_subflow_recv_lookup(msk); + if (unlikely(ssk)) + set_bit(MPTCP_DATA_READY, &msk->flags); + } + + release_sock(sk); + return copied; +} + +/* subflow sockets can be either outgoing (connect) or incoming + * (accept). + * + * Outgoing subflows use in-kernel sockets. + * Incoming subflows do not have their own 'struct socket' allocated, + * so we need to use tcp_close() after detaching them from the mptcp + * parent socket. + */ +static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, + struct mptcp_subflow_context *subflow, + long timeout) +{ + struct socket *sock = READ_ONCE(ssk->sk_socket); + + list_del(&subflow->node); + + if (sock && sock != sk->sk_socket) { + /* outgoing subflow */ + sock_release(sock); + } else { + /* incoming subflow */ + tcp_close(ssk, timeout); + } +} + +static int __mptcp_init_sock(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + INIT_LIST_HEAD(&msk->conn_list); + __set_bit(MPTCP_SEND_SPACE, &msk->flags); + + msk->first = NULL; + + return 0; +} + +static int mptcp_init_sock(struct sock *sk) +{ + if (!mptcp_is_enabled(sock_net(sk))) + return -ENOPROTOOPT; + + return __mptcp_init_sock(sk); +} + +static void mptcp_subflow_shutdown(struct sock *ssk, int how) +{ + lock_sock(ssk); + + switch (ssk->sk_state) { + case TCP_LISTEN: + if (!(how & RCV_SHUTDOWN)) + break; + /* fall through */ + case TCP_SYN_SENT: + tcp_disconnect(ssk, O_NONBLOCK); + break; + default: + ssk->sk_shutdown |= how; + tcp_shutdown(ssk, how); + break; + } + + /* Wake up anyone sleeping in poll. */ + ssk->sk_state_change(ssk); + release_sock(ssk); +} + +/* Called with msk lock held, releases such lock before returning */ +static void mptcp_close(struct sock *sk, long timeout) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_sock *msk = mptcp_sk(sk); + LIST_HEAD(conn_list); + + lock_sock(sk); + + mptcp_token_destroy(msk->token); + inet_sk_state_store(sk, TCP_CLOSE); + + list_splice_init(&msk->conn_list, &conn_list); + + release_sock(sk); + + list_for_each_entry_safe(subflow, tmp, &conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + __mptcp_close_ssk(sk, ssk, subflow, timeout); + } + + sk_common_release(sk); +} + +static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); + struct ipv6_pinfo *msk6 = inet6_sk(msk); + + msk->sk_v6_daddr = ssk->sk_v6_daddr; + msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr; + + if (msk6 && ssk6) { + msk6->saddr = ssk6->saddr; + msk6->flow_label = ssk6->flow_label; + } +#endif + + inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num; + inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport; + inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport; + inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr; + inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr; + inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) +{ + unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo); + + return (struct ipv6_pinfo *)(((u8 *)sk) + offset); +} +#endif + +static struct sock *mptcp_sk_clone_lock(const struct sock *sk) +{ + struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); + + if (!nsk) + return NULL; + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (nsk->sk_family == AF_INET6) + inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk); +#endif + + return nsk; +} + +static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, + bool kern) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *listener; + struct sock *newsk; + + listener = __mptcp_nmpc_socket(msk); + if (WARN_ON_ONCE(!listener)) { + *err = -EINVAL; + return NULL; + } + + pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk)); + newsk = inet_csk_accept(listener->sk, flags, err, kern); + if (!newsk) + return NULL; + + pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk)); + + if (sk_is_mptcp(newsk)) { + struct mptcp_subflow_context *subflow; + struct sock *new_mptcp_sock; + struct sock *ssk = newsk; + u64 ack_seq; + + subflow = mptcp_subflow_ctx(newsk); + lock_sock(sk); + + local_bh_disable(); + new_mptcp_sock = mptcp_sk_clone_lock(sk); + if (!new_mptcp_sock) { + *err = -ENOBUFS; + local_bh_enable(); + release_sock(sk); + mptcp_subflow_shutdown(newsk, SHUT_RDWR + 1); + tcp_close(newsk, 0); + return NULL; + } + + __mptcp_init_sock(new_mptcp_sock); + + msk = mptcp_sk(new_mptcp_sock); + msk->local_key = subflow->local_key; + msk->token = subflow->token; + msk->subflow = NULL; + msk->first = newsk; + + mptcp_token_update_accept(newsk, new_mptcp_sock); + + msk->write_seq = subflow->idsn + 1; + if (subflow->can_ack) { + msk->can_ack = true; + msk->remote_key = subflow->remote_key; + mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); + ack_seq++; + msk->ack_seq = ack_seq; + } + newsk = new_mptcp_sock; + mptcp_copy_inaddrs(newsk, ssk); + list_add(&subflow->node, &msk->conn_list); + + /* will be fully established at mptcp_stream_accept() + * completion. + */ + inet_sk_state_store(new_mptcp_sock, TCP_SYN_RECV); + bh_unlock_sock(new_mptcp_sock); + local_bh_enable(); + release_sock(sk); + + /* the subflow can already receive packet, avoid racing with + * the receive path and process the pending ones + */ + lock_sock(ssk); + subflow->rel_write_seq = 1; + subflow->tcp_sock = ssk; + subflow->conn = new_mptcp_sock; + if (unlikely(!skb_queue_empty(&ssk->sk_receive_queue))) + mptcp_subflow_data_available(ssk); + release_sock(ssk); + } + + return newsk; +} + +static void mptcp_destroy(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (msk->cached_ext) + __skb_ext_put(msk->cached_ext); +} + +static int mptcp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + int ret = -EOPNOTSUPP; + struct socket *ssock; + struct sock *ssk; + + pr_debug("msk=%p", msk); + + /* @@ the meaning of setsockopt() when the socket is connected and + * there are multiple subflows is not defined. + */ + lock_sock(sk); + ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); + if (IS_ERR(ssock)) { + release_sock(sk); + return ret; + } + + ssk = ssock->sk; + sock_hold(ssk); + release_sock(sk); + + ret = tcp_setsockopt(ssk, level, optname, optval, optlen); + sock_put(ssk); + + return ret; +} + +static int mptcp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *option) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + int ret = -EOPNOTSUPP; + struct socket *ssock; + struct sock *ssk; + + pr_debug("msk=%p", msk); + + /* @@ the meaning of getsockopt() when the socket is connected and + * there are multiple subflows is not defined. + */ + lock_sock(sk); + ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); + if (IS_ERR(ssock)) { + release_sock(sk); + return ret; + } + + ssk = ssock->sk; + sock_hold(ssk); + release_sock(sk); + + ret = tcp_getsockopt(ssk, level, optname, optval, option); + sock_put(ssk); + + return ret; +} + +static int mptcp_get_port(struct sock *sk, unsigned short snum) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *ssock; + + ssock = __mptcp_nmpc_socket(msk); + pr_debug("msk=%p, subflow=%p", msk, ssock); + if (WARN_ON_ONCE(!ssock)) + return -EINVAL; + + return inet_csk_get_port(ssock->sk, snum); +} + +void mptcp_finish_connect(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + struct sock *sk; + u64 ack_seq; + + subflow = mptcp_subflow_ctx(ssk); + + if (!subflow->mp_capable) + return; + + sk = subflow->conn; + msk = mptcp_sk(sk); + + pr_debug("msk=%p, token=%u", sk, subflow->token); + + mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); + ack_seq++; + subflow->map_seq = ack_seq; + subflow->map_subflow_seq = 1; + subflow->rel_write_seq = 1; + + /* the socket is not connected yet, no msk/subflow ops can access/race + * accessing the field below + */ + WRITE_ONCE(msk->remote_key, subflow->remote_key); + WRITE_ONCE(msk->local_key, subflow->local_key); + WRITE_ONCE(msk->token, subflow->token); + WRITE_ONCE(msk->write_seq, subflow->idsn + 1); + WRITE_ONCE(msk->ack_seq, ack_seq); + WRITE_ONCE(msk->can_ack, 1); +} + +static void mptcp_sock_graft(struct sock *sk, struct socket *parent) +{ + write_lock_bh(&sk->sk_callback_lock); + rcu_assign_pointer(sk->sk_wq, &parent->wq); + sk_set_socket(sk, parent); + sk->sk_uid = SOCK_INODE(parent)->i_uid; + write_unlock_bh(&sk->sk_callback_lock); +} + +static bool mptcp_memory_free(const struct sock *sk, int wake) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true; +} + +static struct proto mptcp_prot = { + .name = "MPTCP", + .owner = THIS_MODULE, + .init = mptcp_init_sock, + .close = mptcp_close, + .accept = mptcp_accept, + .setsockopt = mptcp_setsockopt, + .getsockopt = mptcp_getsockopt, + .shutdown = tcp_shutdown, + .destroy = mptcp_destroy, + .sendmsg = mptcp_sendmsg, + .recvmsg = mptcp_recvmsg, + .hash = inet_hash, + .unhash = inet_unhash, + .get_port = mptcp_get_port, + .stream_memory_free = mptcp_memory_free, + .obj_size = sizeof(struct mptcp_sock), + .no_autobind = true, +}; + +static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err; + + lock_sock(sock->sk); + ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); + if (IS_ERR(ssock)) { + err = PTR_ERR(ssock); + goto unlock; + } + + err = ssock->ops->bind(ssock, uaddr, addr_len); + if (!err) + mptcp_copy_inaddrs(sock->sk, ssock->sk); + +unlock: + release_sock(sock->sk); + return err; +} + +static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err; + + lock_sock(sock->sk); + ssock = __mptcp_socket_create(msk, TCP_SYN_SENT); + if (IS_ERR(ssock)) { + err = PTR_ERR(ssock); + goto unlock; + } + +#ifdef CONFIG_TCP_MD5SIG + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of + * TCP option space. + */ + if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) + mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0; +#endif + + err = ssock->ops->connect(ssock, uaddr, addr_len, flags); + inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); + mptcp_copy_inaddrs(sock->sk, ssock->sk); + +unlock: + release_sock(sock->sk); + return err; +} + +static int mptcp_v4_getname(struct socket *sock, struct sockaddr *uaddr, + int peer) +{ + if (sock->sk->sk_prot == &tcp_prot) { + /* we are being invoked from __sys_accept4, after + * mptcp_accept() has just accepted a non-mp-capable + * flow: sk is a tcp_sk, not an mptcp one. + * + * Hand the socket over to tcp so all further socket ops + * bypass mptcp. + */ + sock->ops = &inet_stream_ops; + } + + return inet_getname(sock, uaddr, peer); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static int mptcp_v6_getname(struct socket *sock, struct sockaddr *uaddr, + int peer) +{ + if (sock->sk->sk_prot == &tcpv6_prot) { + /* we are being invoked from __sys_accept4 after + * mptcp_accept() has accepted a non-mp-capable + * subflow: sk is a tcp_sk, not mptcp. + * + * Hand the socket over to tcp so all further + * socket ops bypass mptcp. + */ + sock->ops = &inet6_stream_ops; + } + + return inet6_getname(sock, uaddr, peer); +} +#endif + +static int mptcp_listen(struct socket *sock, int backlog) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err; + + pr_debug("msk=%p", msk); + + lock_sock(sock->sk); + ssock = __mptcp_socket_create(msk, TCP_LISTEN); + if (IS_ERR(ssock)) { + err = PTR_ERR(ssock); + goto unlock; + } + + err = ssock->ops->listen(ssock, backlog); + inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); + if (!err) + mptcp_copy_inaddrs(sock->sk, ssock->sk); + +unlock: + release_sock(sock->sk); + return err; +} + +static bool is_tcp_proto(const struct proto *p) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + return p == &tcp_prot || p == &tcpv6_prot; +#else + return p == &tcp_prot; +#endif +} + +static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, + int flags, bool kern) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err; + + pr_debug("msk=%p", msk); + + lock_sock(sock->sk); + if (sock->sk->sk_state != TCP_LISTEN) + goto unlock_fail; + + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) + goto unlock_fail; + + sock_hold(ssock->sk); + release_sock(sock->sk); + + err = ssock->ops->accept(sock, newsock, flags, kern); + if (err == 0 && !is_tcp_proto(newsock->sk->sk_prot)) { + struct mptcp_sock *msk = mptcp_sk(newsock->sk); + struct mptcp_subflow_context *subflow; + + /* set ssk->sk_socket of accept()ed flows to mptcp socket. + * This is needed so NOSPACE flag can be set from tcp stack. + */ + list_for_each_entry(subflow, &msk->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (!ssk->sk_socket) + mptcp_sock_graft(ssk, newsock); + } + + inet_sk_state_store(newsock->sk, TCP_ESTABLISHED); + } + + sock_put(ssock->sk); + return err; + +unlock_fail: + release_sock(sock->sk); + return -EINVAL; +} + +static __poll_t mptcp_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) +{ + struct sock *sk = sock->sk; + struct mptcp_sock *msk; + struct socket *ssock; + __poll_t mask = 0; + + msk = mptcp_sk(sk); + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (ssock) { + mask = ssock->ops->poll(file, ssock, wait); + release_sock(sk); + return mask; + } + + release_sock(sk); + sock_poll_wait(file, sock, wait); + lock_sock(sk); + ssock = __mptcp_tcp_fallback(msk); + if (unlikely(ssock)) + return ssock->ops->poll(file, ssock, NULL); + + if (test_bit(MPTCP_DATA_READY, &msk->flags)) + mask = EPOLLIN | EPOLLRDNORM; + if (sk_stream_is_writeable(sk) && + test_bit(MPTCP_SEND_SPACE, &msk->flags)) + mask |= EPOLLOUT | EPOLLWRNORM; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; + + release_sock(sk); + + return mask; +} + +static int mptcp_shutdown(struct socket *sock, int how) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct mptcp_subflow_context *subflow; + int ret = 0; + + pr_debug("sk=%p, how=%d", msk, how); + + lock_sock(sock->sk); + + if (how == SHUT_WR || how == SHUT_RDWR) + inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); + + how++; + + if ((how & ~SHUTDOWN_MASK) || !how) { + ret = -EINVAL; + goto out_unlock; + } + + if (sock->state == SS_CONNECTING) { + if ((1 << sock->sk->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) + sock->state = SS_DISCONNECTING; + else + sock->state = SS_CONNECTED; + } + + mptcp_for_each_subflow(msk, subflow) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + mptcp_subflow_shutdown(tcp_sk, how); + } + +out_unlock: + release_sock(sock->sk); + + return ret; +} + +static const struct proto_ops mptcp_stream_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = mptcp_bind, + .connect = mptcp_stream_connect, + .socketpair = sock_no_socketpair, + .accept = mptcp_stream_accept, + .getname = mptcp_v4_getname, + .poll = mptcp_poll, + .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, + .listen = mptcp_listen, + .shutdown = mptcp_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = inet_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +static struct inet_protosw mptcp_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_MPTCP, + .prot = &mptcp_prot, + .ops = &mptcp_stream_ops, + .flags = INET_PROTOSW_ICSK, +}; + +void mptcp_proto_init(void) +{ + mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; + + mptcp_subflow_init(); + + if (proto_register(&mptcp_prot, 1) != 0) + panic("Failed to register MPTCP proto.\n"); + + inet_register_protosw(&mptcp_protosw); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static const struct proto_ops mptcp_v6_stream_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = mptcp_bind, + .connect = mptcp_stream_connect, + .socketpair = sock_no_socketpair, + .accept = mptcp_stream_accept, + .getname = mptcp_v6_getname, + .poll = mptcp_poll, + .ioctl = inet6_ioctl, + .gettstamp = sock_gettstamp, + .listen = mptcp_listen, + .shutdown = mptcp_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet6_sendmsg, + .recvmsg = inet6_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +static struct proto mptcp_v6_prot; + +static void mptcp_v6_destroy(struct sock *sk) +{ + mptcp_destroy(sk); + inet6_destroy_sock(sk); +} + +static struct inet_protosw mptcp_v6_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_MPTCP, + .prot = &mptcp_v6_prot, + .ops = &mptcp_v6_stream_ops, + .flags = INET_PROTOSW_ICSK, +}; + +int mptcp_proto_v6_init(void) +{ + int err; + + mptcp_v6_prot = mptcp_prot; + strcpy(mptcp_v6_prot.name, "MPTCPv6"); + mptcp_v6_prot.slab = NULL; + mptcp_v6_prot.destroy = mptcp_v6_destroy; + mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); + + err = proto_register(&mptcp_v6_prot, 1); + if (err) + return err; + + err = inet6_register_protosw(&mptcp_v6_protosw); + if (err) + proto_unregister(&mptcp_v6_prot); + + return err; +} +#endif diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h new file mode 100644 index 000000000000..8a99a2930284 --- /dev/null +++ b/net/mptcp/protocol.h @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#ifndef __MPTCP_PROTOCOL_H +#define __MPTCP_PROTOCOL_H + +#include <linux/random.h> +#include <net/tcp.h> +#include <net/inet_connection_sock.h> + +#define MPTCP_SUPPORTED_VERSION 1 + +/* MPTCP option bits */ +#define OPTION_MPTCP_MPC_SYN BIT(0) +#define OPTION_MPTCP_MPC_SYNACK BIT(1) +#define OPTION_MPTCP_MPC_ACK BIT(2) + +/* MPTCP option subtypes */ +#define MPTCPOPT_MP_CAPABLE 0 +#define MPTCPOPT_MP_JOIN 1 +#define MPTCPOPT_DSS 2 +#define MPTCPOPT_ADD_ADDR 3 +#define MPTCPOPT_RM_ADDR 4 +#define MPTCPOPT_MP_PRIO 5 +#define MPTCPOPT_MP_FAIL 6 +#define MPTCPOPT_MP_FASTCLOSE 7 + +/* MPTCP suboption lengths */ +#define TCPOLEN_MPTCP_MPC_SYN 4 +#define TCPOLEN_MPTCP_MPC_SYNACK 12 +#define TCPOLEN_MPTCP_MPC_ACK 20 +#define TCPOLEN_MPTCP_MPC_ACK_DATA 22 +#define TCPOLEN_MPTCP_DSS_BASE 4 +#define TCPOLEN_MPTCP_DSS_ACK32 4 +#define TCPOLEN_MPTCP_DSS_ACK64 8 +#define TCPOLEN_MPTCP_DSS_MAP32 10 +#define TCPOLEN_MPTCP_DSS_MAP64 14 +#define TCPOLEN_MPTCP_DSS_CHECKSUM 2 + +/* MPTCP MP_CAPABLE flags */ +#define MPTCP_VERSION_MASK (0x0F) +#define MPTCP_CAP_CHECKSUM_REQD BIT(7) +#define MPTCP_CAP_EXTENSIBILITY BIT(6) +#define MPTCP_CAP_HMAC_SHA256 BIT(0) +#define MPTCP_CAP_FLAG_MASK (0x3F) + +/* MPTCP DSS flags */ +#define MPTCP_DSS_DATA_FIN BIT(4) +#define MPTCP_DSS_DSN64 BIT(3) +#define MPTCP_DSS_HAS_MAP BIT(2) +#define MPTCP_DSS_ACK64 BIT(1) +#define MPTCP_DSS_HAS_ACK BIT(0) +#define MPTCP_DSS_FLAG_MASK (0x1F) + +/* MPTCP socket flags */ +#define MPTCP_DATA_READY BIT(0) +#define MPTCP_SEND_SPACE BIT(1) + +/* MPTCP connection sock */ +struct mptcp_sock { + /* inet_connection_sock must be the first member */ + struct inet_connection_sock sk; + u64 local_key; + u64 remote_key; + u64 write_seq; + u64 ack_seq; + u32 token; + unsigned long flags; + bool can_ack; + struct list_head conn_list; + struct skb_ext *cached_ext; /* for the next sendmsg */ + struct socket *subflow; /* outgoing connect/listener/!mp_capable */ + struct sock *first; +}; + +#define mptcp_for_each_subflow(__msk, __subflow) \ + list_for_each_entry(__subflow, &((__msk)->conn_list), node) + +static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) +{ + return (struct mptcp_sock *)sk; +} + +struct mptcp_subflow_request_sock { + struct tcp_request_sock sk; + u16 mp_capable : 1, + mp_join : 1, + backup : 1, + remote_key_valid : 1; + u64 local_key; + u64 remote_key; + u64 idsn; + u32 token; + u32 ssn_offset; +}; + +static inline struct mptcp_subflow_request_sock * +mptcp_subflow_rsk(const struct request_sock *rsk) +{ + return (struct mptcp_subflow_request_sock *)rsk; +} + +/* MPTCP subflow context */ +struct mptcp_subflow_context { + struct list_head node;/* conn_list of subflows */ + u64 local_key; + u64 remote_key; + u64 idsn; + u64 map_seq; + u32 snd_isn; + u32 token; + u32 rel_write_seq; + u32 map_subflow_seq; + u32 ssn_offset; + u32 map_data_len; + u32 request_mptcp : 1, /* send MP_CAPABLE */ + mp_capable : 1, /* remote is MPTCP capable */ + fourth_ack : 1, /* send initial DSS */ + conn_finished : 1, + map_valid : 1, + mpc_map : 1, + data_avail : 1, + rx_eof : 1, + can_ack : 1; /* only after processing the remote a key */ + + struct sock *tcp_sock; /* tcp sk backpointer */ + struct sock *conn; /* parent mptcp_sock */ + const struct inet_connection_sock_af_ops *icsk_af_ops; + void (*tcp_data_ready)(struct sock *sk); + void (*tcp_state_change)(struct sock *sk); + void (*tcp_write_space)(struct sock *sk); + + struct rcu_head rcu; +}; + +static inline struct mptcp_subflow_context * +mptcp_subflow_ctx(const struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + /* Use RCU on icsk_ulp_data only for sock diag code */ + return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data; +} + +static inline struct sock * +mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) +{ + return subflow->tcp_sock; +} + +static inline u64 +mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow) +{ + return tcp_sk(mptcp_subflow_tcp_sock(subflow))->copied_seq - + subflow->ssn_offset - + subflow->map_subflow_seq; +} + +static inline u64 +mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) +{ + return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); +} + +int mptcp_is_enabled(struct net *net); +bool mptcp_subflow_data_available(struct sock *sk); +void mptcp_subflow_init(void); +int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); + +static inline void mptcp_subflow_tcp_fallback(struct sock *sk, + struct mptcp_subflow_context *ctx) +{ + sk->sk_data_ready = ctx->tcp_data_ready; + sk->sk_state_change = ctx->tcp_state_change; + sk->sk_write_space = ctx->tcp_write_space; + + inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; +} + +extern const struct inet_connection_sock_af_ops ipv4_specific; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +extern const struct inet_connection_sock_af_ops ipv6_specific; +#endif + +void mptcp_proto_init(void); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +int mptcp_proto_v6_init(void); +#endif + +struct mptcp_read_arg { + struct msghdr *msg; +}; + +int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len); + +void mptcp_get_options(const struct sk_buff *skb, + struct tcp_options_received *opt_rx); + +void mptcp_finish_connect(struct sock *sk); + +int mptcp_token_new_request(struct request_sock *req); +void mptcp_token_destroy_request(u32 token); +int mptcp_token_new_connect(struct sock *sk); +int mptcp_token_new_accept(u32 token); +void mptcp_token_update_accept(struct sock *sk, struct sock *conn); +void mptcp_token_destroy(u32 token); + +void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); +static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) +{ + /* we might consider a faster version that computes the key as a + * hash of some information available in the MPTCP socket. Use + * random data at the moment, as it's probably the safest option + * in case multiple sockets are opened in different namespaces at + * the same time. + */ + get_random_bytes(key, sizeof(u64)); + mptcp_crypto_key_sha(*key, token, idsn); +} + +void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, + void *hash_out); + +static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) +{ + return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); +} + +static inline bool before64(__u64 seq1, __u64 seq2) +{ + return (__s64)(seq1 - seq2) < 0; +} + +#define after64(seq2, seq1) before64(seq1, seq2) + +#endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c new file mode 100644 index 000000000000..65122edf60aa --- /dev/null +++ b/net/mptcp/subflow.c @@ -0,0 +1,865 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <net/sock.h> +#include <net/inet_common.h> +#include <net/inet_hashtables.h> +#include <net/protocol.h> +#include <net/tcp.h> +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +#include <net/ip6_route.h> +#endif +#include <net/mptcp.h> +#include "protocol.h" + +static int subflow_rebuild_header(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + int err = 0; + + if (subflow->request_mptcp && !subflow->token) { + pr_debug("subflow=%p", sk); + err = mptcp_token_new_connect(sk); + } + + if (err) + return err; + + return subflow->icsk_af_ops->rebuild_header(sk); +} + +static void subflow_req_destructor(struct request_sock *req) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + + pr_debug("subflow_req=%p", subflow_req); + + if (subflow_req->mp_capable) + mptcp_token_destroy_request(subflow_req->token); + tcp_request_sock_ops.destructor(req); +} + +static void subflow_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct tcp_options_received rx_opt; + + pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); + + memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp)); + mptcp_get_options(skb, &rx_opt); + + subflow_req->mp_capable = 0; + subflow_req->remote_key_valid = 0; + +#ifdef CONFIG_TCP_MD5SIG + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of + * TCP option space. + */ + if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) + return; +#endif + + if (rx_opt.mptcp.mp_capable && listener->request_mptcp) { + int err; + + err = mptcp_token_new_request(req); + if (err == 0) + subflow_req->mp_capable = 1; + + subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; + } +} + +static void subflow_v4_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + tcp_rsk(req)->is_mptcp = 1; + + tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb); + + subflow_init_req(req, sk_listener, skb); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static void subflow_v6_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + tcp_rsk(req)->is_mptcp = 1; + + tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb); + + subflow_init_req(req, sk_listener, skb); +} +#endif + +static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); + + if (subflow->conn && !subflow->conn_finished) { + pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk), + subflow->remote_key); + mptcp_finish_connect(sk); + subflow->conn_finished = 1; + + if (skb) { + pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq); + subflow->ssn_offset = TCP_SKB_CB(skb)->seq; + } + } +} + +static struct request_sock_ops subflow_request_sock_ops; +static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops; + +static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + pr_debug("subflow=%p", subflow); + + /* Never answer to SYNs sent to broadcast or multicast */ + if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + goto drop; + + return tcp_conn_request(&subflow_request_sock_ops, + &subflow_request_sock_ipv4_ops, + sk, skb); +drop: + tcp_listendrop(sk); + return 0; +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops; +static struct inet_connection_sock_af_ops subflow_v6_specific; +static struct inet_connection_sock_af_ops subflow_v6m_specific; + +static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + pr_debug("subflow=%p", subflow); + + if (skb->protocol == htons(ETH_P_IP)) + return subflow_v4_conn_request(sk, skb); + + if (!ipv6_unicast_destination(skb)) + goto drop; + + return tcp_conn_request(&subflow_request_sock_ops, + &subflow_request_sock_ipv6_ops, sk, skb); + +drop: + tcp_listendrop(sk); + return 0; /* don't send reset */ +} +#endif + +static struct sock *subflow_syn_recv_sock(const struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) +{ + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); + struct mptcp_subflow_request_sock *subflow_req; + struct tcp_options_received opt_rx; + struct sock *child; + + pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); + + if (tcp_rsk(req)->is_mptcp == 0) + goto create_child; + + /* if the sk is MP_CAPABLE, we try to fetch the client key */ + subflow_req = mptcp_subflow_rsk(req); + if (subflow_req->mp_capable) { + if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) { + /* here we can receive and accept an in-window, + * out-of-order pkt, which will not carry the MP_CAPABLE + * opt even on mptcp enabled paths + */ + goto create_child; + } + + opt_rx.mptcp.mp_capable = 0; + mptcp_get_options(skb, &opt_rx); + if (opt_rx.mptcp.mp_capable) { + subflow_req->remote_key = opt_rx.mptcp.sndr_key; + subflow_req->remote_key_valid = 1; + } else { + subflow_req->mp_capable = 0; + } + } + +create_child: + child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, + req_unhash, own_req); + + if (child && *own_req) { + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); + + /* we have null ctx on TCP fallback, not fatal on MPC + * handshake + */ + if (!ctx) + return child; + + if (ctx->mp_capable) { + if (mptcp_token_new_accept(ctx->token)) + goto close_child; + } + } + + return child; + +close_child: + pr_debug("closing child socket"); + tcp_send_active_reset(child, GFP_ATOMIC); + inet_csk_prepare_forced_close(child); + tcp_done(child); + return NULL; +} + +static struct inet_connection_sock_af_ops subflow_specific; + +enum mapping_status { + MAPPING_OK, + MAPPING_INVALID, + MAPPING_EMPTY, + MAPPING_DATA_FIN +}; + +static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq) +{ + if ((u32)seq == (u32)old_seq) + return old_seq; + + /* Assume map covers data not mapped yet. */ + return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32)); +} + +static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) +{ + WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d", + ssn, subflow->map_subflow_seq, subflow->map_data_len); +} + +static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + unsigned int skb_consumed; + + skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq; + if (WARN_ON_ONCE(skb_consumed >= skb->len)) + return true; + + return skb->len - skb_consumed <= subflow->map_data_len - + mptcp_subflow_get_map_offset(subflow); +} + +static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; + + if (unlikely(before(ssn, subflow->map_subflow_seq))) { + /* Mapping covers data later in the subflow stream, + * currently unsupported. + */ + warn_bad_map(subflow, ssn); + return false; + } + if (unlikely(!before(ssn, subflow->map_subflow_seq + + subflow->map_data_len))) { + /* Mapping does covers past subflow data, invalid */ + warn_bad_map(subflow, ssn + skb->len); + return false; + } + return true; +} + +static enum mapping_status get_mapping_status(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct mptcp_ext *mpext; + struct sk_buff *skb; + u16 data_len; + u64 map_seq; + + skb = skb_peek(&ssk->sk_receive_queue); + if (!skb) + return MAPPING_EMPTY; + + mpext = mptcp_get_ext(skb); + if (!mpext || !mpext->use_map) { + if (!subflow->map_valid && !skb->len) { + /* the TCP stack deliver 0 len FIN pkt to the receive + * queue, that is the only 0len pkts ever expected here, + * and we can admit no mapping only for 0 len pkts + */ + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) + WARN_ONCE(1, "0len seq %d:%d flags %x", + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + TCP_SKB_CB(skb)->tcp_flags); + sk_eat_skb(ssk, skb); + return MAPPING_EMPTY; + } + + if (!subflow->map_valid) + return MAPPING_INVALID; + + goto validate_seq; + } + + pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d", + mpext->data_seq, mpext->dsn64, mpext->subflow_seq, + mpext->data_len, mpext->data_fin); + + data_len = mpext->data_len; + if (data_len == 0) { + pr_err("Infinite mapping not handled"); + return MAPPING_INVALID; + } + + if (mpext->data_fin == 1) { + if (data_len == 1) { + pr_debug("DATA_FIN with no payload"); + if (subflow->map_valid) { + /* A DATA_FIN might arrive in a DSS + * option before the previous mapping + * has been fully consumed. Continue + * handling the existing mapping. + */ + skb_ext_del(skb, SKB_EXT_MPTCP); + return MAPPING_OK; + } else { + return MAPPING_DATA_FIN; + } + } + + /* Adjust for DATA_FIN using 1 byte of sequence space */ + data_len--; + } + + if (!mpext->dsn64) { + map_seq = expand_seq(subflow->map_seq, subflow->map_data_len, + mpext->data_seq); + pr_debug("expanded seq=%llu", subflow->map_seq); + } else { + map_seq = mpext->data_seq; + } + + if (subflow->map_valid) { + /* Allow replacing only with an identical map */ + if (subflow->map_seq == map_seq && + subflow->map_subflow_seq == mpext->subflow_seq && + subflow->map_data_len == data_len) { + skb_ext_del(skb, SKB_EXT_MPTCP); + return MAPPING_OK; + } + + /* If this skb data are fully covered by the current mapping, + * the new map would need caching, which is not supported + */ + if (skb_is_fully_mapped(ssk, skb)) + return MAPPING_INVALID; + + /* will validate the next map after consuming the current one */ + return MAPPING_OK; + } + + subflow->map_seq = map_seq; + subflow->map_subflow_seq = mpext->subflow_seq; + subflow->map_data_len = data_len; + subflow->map_valid = 1; + subflow->mpc_map = mpext->mpc_map; + pr_debug("new map seq=%llu subflow_seq=%u data_len=%u", + subflow->map_seq, subflow->map_subflow_seq, + subflow->map_data_len); + +validate_seq: + /* we revalidate valid mapping on new skb, because we must ensure + * the current skb is completely covered by the available mapping + */ + if (!validate_mapping(ssk, skb)) + return MAPPING_INVALID; + + skb_ext_del(skb, SKB_EXT_MPTCP); + return MAPPING_OK; +} + +static bool subflow_check_data_avail(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + enum mapping_status status; + struct mptcp_sock *msk; + struct sk_buff *skb; + + pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk, + subflow->data_avail, skb_peek(&ssk->sk_receive_queue)); + if (subflow->data_avail) + return true; + + if (!subflow->conn) + return false; + + msk = mptcp_sk(subflow->conn); + for (;;) { + u32 map_remaining; + size_t delta; + u64 ack_seq; + u64 old_ack; + + status = get_mapping_status(ssk); + pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status); + if (status == MAPPING_INVALID) { + ssk->sk_err = EBADMSG; + goto fatal; + } + + if (status != MAPPING_OK) + return false; + + skb = skb_peek(&ssk->sk_receive_queue); + if (WARN_ON_ONCE(!skb)) + return false; + + /* if msk lacks the remote key, this subflow must provide an + * MP_CAPABLE-based mapping + */ + if (unlikely(!READ_ONCE(msk->can_ack))) { + if (!subflow->mpc_map) { + ssk->sk_err = EBADMSG; + goto fatal; + } + WRITE_ONCE(msk->remote_key, subflow->remote_key); + WRITE_ONCE(msk->ack_seq, subflow->map_seq); + WRITE_ONCE(msk->can_ack, true); + } + + old_ack = READ_ONCE(msk->ack_seq); + ack_seq = mptcp_subflow_get_mapped_dsn(subflow); + pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack, + ack_seq); + if (ack_seq == old_ack) + break; + + /* only accept in-sequence mapping. Old values are spurious + * retransmission; we can hit "future" values on active backup + * subflow switch, we relay on retransmissions to get + * in-sequence data. + * Cuncurrent subflows support will require subflow data + * reordering + */ + map_remaining = subflow->map_data_len - + mptcp_subflow_get_map_offset(subflow); + if (before64(ack_seq, old_ack)) + delta = min_t(size_t, old_ack - ack_seq, map_remaining); + else + delta = min_t(size_t, ack_seq - old_ack, map_remaining); + + /* discard mapped data */ + pr_debug("discarding %zu bytes, current map len=%d", delta, + map_remaining); + if (delta) { + struct mptcp_read_arg arg = { + .msg = NULL, + }; + read_descriptor_t desc = { + .count = delta, + .arg.data = &arg, + }; + int ret; + + ret = tcp_read_sock(ssk, &desc, mptcp_read_actor); + if (ret < 0) { + ssk->sk_err = -ret; + goto fatal; + } + if (ret < delta) + return false; + if (delta == map_remaining) + subflow->map_valid = 0; + } + } + return true; + +fatal: + /* fatal protocol error, close the socket */ + /* This barrier is coupled with smp_rmb() in tcp_poll() */ + smp_wmb(); + ssk->sk_error_report(ssk); + tcp_set_state(ssk, TCP_CLOSE); + tcp_send_active_reset(ssk, GFP_ATOMIC); + return false; +} + +bool mptcp_subflow_data_available(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sk_buff *skb; + + /* check if current mapping is still valid */ + if (subflow->map_valid && + mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { + subflow->map_valid = 0; + subflow->data_avail = 0; + + pr_debug("Done with mapping: seq=%u data_len=%u", + subflow->map_subflow_seq, + subflow->map_data_len); + } + + if (!subflow_check_data_avail(sk)) { + subflow->data_avail = 0; + return false; + } + + skb = skb_peek(&sk->sk_receive_queue); + subflow->data_avail = skb && + before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq); + return subflow->data_avail; +} + +static void subflow_data_ready(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *parent = subflow->conn; + + if (!parent || !subflow->mp_capable) { + subflow->tcp_data_ready(sk); + + if (parent) + parent->sk_data_ready(parent); + return; + } + + if (mptcp_subflow_data_available(sk)) { + set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags); + + parent->sk_data_ready(parent); + } +} + +static void subflow_write_space(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *parent = subflow->conn; + + sk_stream_write_space(sk); + if (parent && sk_stream_is_writeable(sk)) { + set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags); + smp_mb__after_atomic(); + /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */ + sk_stream_write_space(parent); + } +} + +static struct inet_connection_sock_af_ops * +subflow_default_af_ops(struct sock *sk) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (sk->sk_family == AF_INET6) + return &subflow_v6_specific; +#endif + return &subflow_specific; +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +void mptcpv6_handle_mapped(struct sock *sk, bool mapped) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock_af_ops *target; + + target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); + + pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d", + subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); + + if (likely(icsk->icsk_af_ops == target)) + return; + + subflow->icsk_af_ops = icsk->icsk_af_ops; + icsk->icsk_af_ops = target; +} +#endif + +int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) +{ + struct mptcp_subflow_context *subflow; + struct net *net = sock_net(sk); + struct socket *sf; + int err; + + err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP, + &sf); + if (err) + return err; + + lock_sock(sf->sk); + + /* kernel sockets do not by default acquire net ref, but TCP timer + * needs it. + */ + sf->sk->sk_net_refcnt = 1; + get_net(net); +#ifdef CONFIG_PROC_FS + this_cpu_add(*net->core.sock_inuse, 1); +#endif + err = tcp_set_ulp(sf->sk, "mptcp"); + release_sock(sf->sk); + + if (err) + return err; + + subflow = mptcp_subflow_ctx(sf->sk); + pr_debug("subflow=%p", subflow); + + *new_sock = sf; + sock_hold(sk); + subflow->conn = sk; + + return 0; +} + +static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, + gfp_t priority) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct mptcp_subflow_context *ctx; + + ctx = kzalloc(sizeof(*ctx), priority); + if (!ctx) + return NULL; + + rcu_assign_pointer(icsk->icsk_ulp_data, ctx); + INIT_LIST_HEAD(&ctx->node); + + pr_debug("subflow=%p", ctx); + + ctx->tcp_sock = sk; + + return ctx; +} + +static void __subflow_state_change(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_all(&wq->wait); + rcu_read_unlock(); +} + +static bool subflow_is_done(const struct sock *sk) +{ + return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; +} + +static void subflow_state_change(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *parent = READ_ONCE(subflow->conn); + + __subflow_state_change(sk); + + /* as recvmsg() does not acquire the subflow socket for ssk selection + * a fin packet carrying a DSS can be unnoticed if we don't trigger + * the data available machinery here. + */ + if (parent && subflow->mp_capable && mptcp_subflow_data_available(sk)) { + set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags); + + parent->sk_data_ready(parent); + } + + if (parent && !(parent->sk_shutdown & RCV_SHUTDOWN) && + !subflow->rx_eof && subflow_is_done(sk)) { + subflow->rx_eof = 1; + parent->sk_shutdown |= RCV_SHUTDOWN; + __subflow_state_change(parent); + } +} + +static int subflow_ulp_init(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct mptcp_subflow_context *ctx; + struct tcp_sock *tp = tcp_sk(sk); + int err = 0; + + /* disallow attaching ULP to a socket unless it has been + * created with sock_create_kern() + */ + if (!sk->sk_kern_sock) { + err = -EOPNOTSUPP; + goto out; + } + + ctx = subflow_create_ctx(sk, GFP_KERNEL); + if (!ctx) { + err = -ENOMEM; + goto out; + } + + pr_debug("subflow=%p, family=%d", ctx, sk->sk_family); + + tp->is_mptcp = 1; + ctx->icsk_af_ops = icsk->icsk_af_ops; + icsk->icsk_af_ops = subflow_default_af_ops(sk); + ctx->tcp_data_ready = sk->sk_data_ready; + ctx->tcp_state_change = sk->sk_state_change; + ctx->tcp_write_space = sk->sk_write_space; + sk->sk_data_ready = subflow_data_ready; + sk->sk_write_space = subflow_write_space; + sk->sk_state_change = subflow_state_change; +out: + return err; +} + +static void subflow_ulp_release(struct sock *sk) +{ + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk); + + if (!ctx) + return; + + if (ctx->conn) + sock_put(ctx->conn); + + kfree_rcu(ctx, rcu); +} + +static void subflow_ulp_fallback(struct sock *sk, + struct mptcp_subflow_context *old_ctx) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + mptcp_subflow_tcp_fallback(sk, old_ctx); + icsk->icsk_ulp_ops = NULL; + rcu_assign_pointer(icsk->icsk_ulp_data, NULL); + tcp_sk(sk)->is_mptcp = 0; +} + +static void subflow_ulp_clone(const struct request_sock *req, + struct sock *newsk, + const gfp_t priority) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); + struct mptcp_subflow_context *new_ctx; + + if (!tcp_rsk(req)->is_mptcp || !subflow_req->mp_capable) { + subflow_ulp_fallback(newsk, old_ctx); + return; + } + + new_ctx = subflow_create_ctx(newsk, priority); + if (!new_ctx) { + subflow_ulp_fallback(newsk, old_ctx); + return; + } + + /* see comments in subflow_syn_recv_sock(), MPTCP connection is fully + * established only after we receive the remote key + */ + new_ctx->conn_finished = 1; + new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; + new_ctx->tcp_data_ready = old_ctx->tcp_data_ready; + new_ctx->tcp_state_change = old_ctx->tcp_state_change; + new_ctx->tcp_write_space = old_ctx->tcp_write_space; + new_ctx->mp_capable = 1; + new_ctx->fourth_ack = subflow_req->remote_key_valid; + new_ctx->can_ack = subflow_req->remote_key_valid; + new_ctx->remote_key = subflow_req->remote_key; + new_ctx->local_key = subflow_req->local_key; + new_ctx->token = subflow_req->token; + new_ctx->ssn_offset = subflow_req->ssn_offset; + new_ctx->idsn = subflow_req->idsn; +} + +static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { + .name = "mptcp", + .owner = THIS_MODULE, + .init = subflow_ulp_init, + .release = subflow_ulp_release, + .clone = subflow_ulp_clone, +}; + +static int subflow_ops_init(struct request_sock_ops *subflow_ops) +{ + subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock); + subflow_ops->slab_name = "request_sock_subflow"; + + subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name, + subflow_ops->obj_size, 0, + SLAB_ACCOUNT | + SLAB_TYPESAFE_BY_RCU, + NULL); + if (!subflow_ops->slab) + return -ENOMEM; + + subflow_ops->destructor = subflow_req_destructor; + + return 0; +} + +void mptcp_subflow_init(void) +{ + subflow_request_sock_ops = tcp_request_sock_ops; + if (subflow_ops_init(&subflow_request_sock_ops) != 0) + panic("MPTCP: failed to init subflow request sock ops\n"); + + subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; + subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req; + + subflow_specific = ipv4_specific; + subflow_specific.conn_request = subflow_v4_conn_request; + subflow_specific.syn_recv_sock = subflow_syn_recv_sock; + subflow_specific.sk_rx_dst_set = subflow_finish_connect; + subflow_specific.rebuild_header = subflow_rebuild_header; + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; + subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req; + + subflow_v6_specific = ipv6_specific; + subflow_v6_specific.conn_request = subflow_v6_conn_request; + subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; + subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; + subflow_v6_specific.rebuild_header = subflow_rebuild_header; + + subflow_v6m_specific = subflow_v6_specific; + subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; + subflow_v6m_specific.send_check = ipv4_specific.send_check; + subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; + subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; + subflow_v6m_specific.net_frag_header_len = 0; +#endif + + if (tcp_register_ulp(&subflow_ulp_ops) != 0) + panic("MPTCP: failed to register subflows to ULP\n"); +} diff --git a/net/mptcp/token.c b/net/mptcp/token.c new file mode 100644 index 000000000000..84d887806090 --- /dev/null +++ b/net/mptcp/token.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP token management + * Copyright (c) 2017 - 2019, Intel Corporation. + * + * Note: This code is based on mptcp_ctrl.c from multipath-tcp.org, + * authored by: + * + * SĂ©bastien BarrĂ© <sebastien.barre@uclouvain.be> + * Christoph Paasch <christoph.paasch@uclouvain.be> + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> + * Gregory Detal <gregory.detal@uclouvain.be> + * Fabien DuchĂŞne <fabien.duchene@uclouvain.be> + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> + * Lavkesh Lahngir <lavkesh51@gmail.com> + * Andreas Ripke <ripke@neclab.eu> + * Vlad Dogaru <vlad.dogaru@intel.com> + * Octavian Purdila <octavian.purdila@intel.com> + * John Ronan <jronan@tssg.org> + * Catalin Nicutar <catalin.nicutar@gmail.com> + * Brandon Heller <brandonh@stanford.edu> + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/radix-tree.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <net/sock.h> +#include <net/inet_common.h> +#include <net/protocol.h> +#include <net/mptcp.h> +#include "protocol.h" + +static RADIX_TREE(token_tree, GFP_ATOMIC); +static RADIX_TREE(token_req_tree, GFP_ATOMIC); +static DEFINE_SPINLOCK(token_tree_lock); +static int token_used __read_mostly; + +/** + * mptcp_token_new_request - create new key/idsn/token for subflow_request + * @req - the request socket + * + * This function is called when a new mptcp connection is coming in. + * + * It creates a unique token to identify the new mptcp connection, + * a secret local key and the initial data sequence number (idsn). + * + * Returns 0 on success. + */ +int mptcp_token_new_request(struct request_sock *req) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + int err; + + while (1) { + u32 token; + + mptcp_crypto_key_gen_sha(&subflow_req->local_key, + &subflow_req->token, + &subflow_req->idsn); + pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n", + req, subflow_req->local_key, subflow_req->token, + subflow_req->idsn); + + token = subflow_req->token; + spin_lock_bh(&token_tree_lock); + if (!radix_tree_lookup(&token_req_tree, token) && + !radix_tree_lookup(&token_tree, token)) + break; + spin_unlock_bh(&token_tree_lock); + } + + err = radix_tree_insert(&token_req_tree, + subflow_req->token, &token_used); + spin_unlock_bh(&token_tree_lock); + return err; +} + +/** + * mptcp_token_new_connect - create new key/idsn/token for subflow + * @sk - the socket that will initiate a connection + * + * This function is called when a new outgoing mptcp connection is + * initiated. + * + * It creates a unique token to identify the new mptcp connection, + * a secret local key and the initial data sequence number (idsn). + * + * On success, the mptcp connection can be found again using + * the computed token at a later time, this is needed to process + * join requests. + * + * returns 0 on success. + */ +int mptcp_token_new_connect(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *mptcp_sock = subflow->conn; + int err; + + while (1) { + u32 token; + + mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, + &subflow->idsn); + + pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", + sk, subflow->local_key, subflow->token, subflow->idsn); + + token = subflow->token; + spin_lock_bh(&token_tree_lock); + if (!radix_tree_lookup(&token_req_tree, token) && + !radix_tree_lookup(&token_tree, token)) + break; + spin_unlock_bh(&token_tree_lock); + } + err = radix_tree_insert(&token_tree, subflow->token, mptcp_sock); + spin_unlock_bh(&token_tree_lock); + + return err; +} + +/** + * mptcp_token_new_accept - insert token for later processing + * @token: the token to insert to the tree + * + * Called when a SYN packet creates a new logical connection, i.e. + * is not a join request. + * + * We don't have an mptcp socket yet at that point. + * This is paired with mptcp_token_update_accept, called on accept(). + */ +int mptcp_token_new_accept(u32 token) +{ + int err; + + spin_lock_bh(&token_tree_lock); + err = radix_tree_insert(&token_tree, token, &token_used); + spin_unlock_bh(&token_tree_lock); + + return err; +} + +/** + * mptcp_token_update_accept - update token to map to mptcp socket + * @conn: the new struct mptcp_sock + * @sk: the initial subflow for this mptcp socket + * + * Called when the first mptcp socket is created on accept to + * refresh the dummy mapping (done to reserve the token) with + * the mptcp_socket structure that wasn't allocated before. + */ +void mptcp_token_update_accept(struct sock *sk, struct sock *conn) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + void __rcu **slot; + + spin_lock_bh(&token_tree_lock); + slot = radix_tree_lookup_slot(&token_tree, subflow->token); + WARN_ON_ONCE(!slot); + if (slot) { + WARN_ON_ONCE(rcu_access_pointer(*slot) != &token_used); + radix_tree_replace_slot(&token_tree, slot, conn); + } + spin_unlock_bh(&token_tree_lock); +} + +/** + * mptcp_token_destroy_request - remove mptcp connection/token + * @token - token of mptcp connection to remove + * + * Remove not-yet-fully-established incoming connection identified + * by @token. + */ +void mptcp_token_destroy_request(u32 token) +{ + spin_lock_bh(&token_tree_lock); + radix_tree_delete(&token_req_tree, token); + spin_unlock_bh(&token_tree_lock); +} + +/** + * mptcp_token_destroy - remove mptcp connection/token + * @token - token of mptcp connection to remove + * + * Remove the connection identified by @token. + */ +void mptcp_token_destroy(u32 token) +{ + spin_lock_bh(&token_tree_lock); + radix_tree_delete(&token_tree, token); + spin_unlock_bh(&token_tree_lock); +} diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 0b3f0673e1a2..e37102546be6 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -64,6 +64,17 @@ enum { NCSI_MODE_MAX }; +/* Supported media status bits for Mellanox Mac affinity command. + * Bit (0-2) for different protocol support; Bit 1 for RBT support, + * bit 1 for SMBUS support and bit 2 for PCIE support. Bit (3-5) + * for different protocol availability. Bit 4 for RBT, bit 4 for + * SMBUS and bit 5 for PCIE. + */ +enum { + MLX_MC_RBT_SUPPORT = 0x01, /* MC supports RBT */ + MLX_MC_RBT_AVL = 0x08, /* RBT medium is available */ +}; + /* OEM Vendor Manufacture ID */ #define NCSI_OEM_MFR_MLX_ID 0x8119 #define NCSI_OEM_MFR_BCM_ID 0x113d @@ -72,9 +83,15 @@ enum { /* Mellanox specific OEM Command */ #define NCSI_OEM_MLX_CMD_GMA 0x00 /* CMD ID for Get MAC */ #define NCSI_OEM_MLX_CMD_GMA_PARAM 0x1b /* Parameter for GMA */ +#define NCSI_OEM_MLX_CMD_SMAF 0x01 /* CMD ID for Set MC Affinity */ +#define NCSI_OEM_MLX_CMD_SMAF_PARAM 0x07 /* Parameter for SMAF */ /* OEM Command payload lengths*/ #define NCSI_OEM_BCM_CMD_GMA_LEN 12 #define NCSI_OEM_MLX_CMD_GMA_LEN 8 +#define NCSI_OEM_MLX_CMD_SMAF_LEN 60 +/* Offset in OEM request */ +#define MLX_SMAF_MAC_ADDR_OFFSET 8 /* Offset for MAC in SMAF */ +#define MLX_SMAF_MED_SUPPORT_OFFSET 14 /* Offset for medium in SMAF */ /* Mac address offset in OEM response */ #define BCM_MAC_ADDR_OFFSET 28 #define MLX_MAC_ADDR_OFFSET 8 @@ -251,6 +268,8 @@ enum { ncsi_dev_state_probe_deselect = 0x0201, ncsi_dev_state_probe_package, ncsi_dev_state_probe_channel, + ncsi_dev_state_probe_mlx_gma, + ncsi_dev_state_probe_mlx_smaf, ncsi_dev_state_probe_cis, ncsi_dev_state_probe_gvi, ncsi_dev_state_probe_gc, @@ -264,9 +283,7 @@ enum { ncsi_dev_state_config_ev, ncsi_dev_state_config_sma, ncsi_dev_state_config_ebf, -#if IS_ENABLED(CONFIG_IPV6) - ncsi_dev_state_config_egmf, -#endif + ncsi_dev_state_config_dgmf, ncsi_dev_state_config_ecnt, ncsi_dev_state_config_ec, ncsi_dev_state_config_ae, @@ -295,9 +312,6 @@ struct ncsi_dev_priv { #define NCSI_DEV_RESET 8 /* Reset state of NC */ unsigned int gma_flag; /* OEM GMA flag */ spinlock_t lock; /* Protect the NCSI device */ -#if IS_ENABLED(CONFIG_IPV6) - unsigned int inet6_addr_num; /* Number of IPv6 addresses */ -#endif unsigned int package_probe_id;/* Current ID during probe */ unsigned int package_num; /* Number of packages */ struct list_head packages; /* List of packages */ @@ -316,6 +330,7 @@ struct ncsi_dev_priv { struct list_head vlan_vids; /* List of active VLAN IDs */ bool multi_package; /* Enable multiple packages */ + bool mlx_multi_host; /* Enable multi host Mellanox */ u32 package_whitelist; /* Packages to configure */ }; diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c index 5c3fad8cba57..ba9ae482141b 100644 --- a/net/ncsi/ncsi-cmd.c +++ b/net/ncsi/ncsi-cmd.c @@ -54,7 +54,7 @@ static void ncsi_cmd_build_header(struct ncsi_pkt_hdr *h, checksum = ncsi_calculate_checksum((unsigned char *)h, sizeof(*h) + nca->payload); pchecksum = (__be32 *)((void *)h + sizeof(struct ncsi_pkt_hdr) + - nca->payload); + ALIGN(nca->payload, 4)); *pchecksum = htonl(checksum); } @@ -309,14 +309,21 @@ static struct ncsi_request *ncsi_alloc_command(struct ncsi_cmd_arg *nca) int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca) { + struct ncsi_cmd_handler *nch = NULL; struct ncsi_request *nr; + unsigned char type; struct ethhdr *eh; - struct ncsi_cmd_handler *nch = NULL; int i, ret; + /* Use OEM generic handler for Netlink request */ + if (nca->req_flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) + type = NCSI_PKT_CMD_OEM; + else + type = nca->type; + /* Search for the handler */ for (i = 0; i < ARRAY_SIZE(ncsi_cmd_handlers); i++) { - if (ncsi_cmd_handlers[i].type == nca->type) { + if (ncsi_cmd_handlers[i].type == type) { if (ncsi_cmd_handlers[i].handler) nch = &ncsi_cmd_handlers[i]; else @@ -362,7 +369,15 @@ int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca) eh = skb_push(nr->cmd, sizeof(*eh)); eh->h_proto = htons(ETH_P_NCSI); eth_broadcast_addr(eh->h_dest); - eth_broadcast_addr(eh->h_source); + + /* If mac address received from device then use it for + * source address as unicast address else use broadcast + * address as source address + */ + if (nca->ndp->gma_flag == 1) + memcpy(eh->h_source, nca->ndp->ndev.dev->dev_addr, ETH_ALEN); + else + eth_broadcast_addr(eh->h_source); /* Start the timer for the request that might not have * corresponding response. Given NCSI is an internal diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 755aab66dcab..1f387be7827b 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -8,13 +8,14 @@ #include <linux/init.h> #include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/of.h> +#include <linux/platform_device.h> #include <net/ncsi.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/addrconf.h> #include <net/ipv6.h> -#include <net/if_inet6.h> #include <net/genetlink.h> #include "internal.h" @@ -731,6 +732,34 @@ static int ncsi_oem_gma_handler_mlx(struct ncsi_cmd_arg *nca) return ret; } +static int ncsi_oem_smaf_mlx(struct ncsi_cmd_arg *nca) +{ + union { + u8 data_u8[NCSI_OEM_MLX_CMD_SMAF_LEN]; + u32 data_u32[NCSI_OEM_MLX_CMD_SMAF_LEN / sizeof(u32)]; + } u; + int ret = 0; + + memset(&u, 0, sizeof(u)); + u.data_u32[0] = ntohl(NCSI_OEM_MFR_MLX_ID); + u.data_u8[5] = NCSI_OEM_MLX_CMD_SMAF; + u.data_u8[6] = NCSI_OEM_MLX_CMD_SMAF_PARAM; + memcpy(&u.data_u8[MLX_SMAF_MAC_ADDR_OFFSET], + nca->ndp->ndev.dev->dev_addr, ETH_ALEN); + u.data_u8[MLX_SMAF_MED_SUPPORT_OFFSET] = + (MLX_MC_RBT_AVL | MLX_MC_RBT_SUPPORT); + + nca->payload = NCSI_OEM_MLX_CMD_SMAF_LEN; + nca->data = u.data_u8; + + ret = ncsi_xmit_cmd(nca); + if (ret) + netdev_err(nca->ndp->ndev.dev, + "NCSI: Failed to transmit cmd 0x%x during probe\n", + nca->type); + return ret; +} + /* OEM Command handlers initialization */ static struct ncsi_oem_gma_handler { unsigned int mfr_id; @@ -765,9 +794,6 @@ static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id) return -1; } - /* Set the flag for GMA command which should only be called once */ - nca->ndp->gma_flag = 1; - /* Get Mac address from NCSI device */ return nch->handler(nca); } @@ -978,9 +1004,7 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) case ncsi_dev_state_config_ev: case ncsi_dev_state_config_sma: case ncsi_dev_state_config_ebf: -#if IS_ENABLED(CONFIG_IPV6) - case ncsi_dev_state_config_egmf: -#endif + case ncsi_dev_state_config_dgmf: case ncsi_dev_state_config_ecnt: case ncsi_dev_state_config_ec: case ncsi_dev_state_config_ae: @@ -1033,23 +1057,23 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) } else if (nd->state == ncsi_dev_state_config_ebf) { nca.type = NCSI_PKT_CMD_EBF; nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap; - if (ncsi_channel_is_tx(ndp, nc)) + /* if multicast global filtering is supported then + * disable it so that all multicast packet will be + * forwarded to management controller + */ + if (nc->caps[NCSI_CAP_GENERIC].cap & + NCSI_CAP_GENERIC_MC) + nd->state = ncsi_dev_state_config_dgmf; + else if (ncsi_channel_is_tx(ndp, nc)) nd->state = ncsi_dev_state_config_ecnt; else nd->state = ncsi_dev_state_config_ec; -#if IS_ENABLED(CONFIG_IPV6) - if (ndp->inet6_addr_num > 0 && - (nc->caps[NCSI_CAP_GENERIC].cap & - NCSI_CAP_GENERIC_MC)) - nd->state = ncsi_dev_state_config_egmf; - } else if (nd->state == ncsi_dev_state_config_egmf) { - nca.type = NCSI_PKT_CMD_EGMF; - nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; + } else if (nd->state == ncsi_dev_state_config_dgmf) { + nca.type = NCSI_PKT_CMD_DGMF; if (ncsi_channel_is_tx(ndp, nc)) nd->state = ncsi_dev_state_config_ecnt; else nd->state = ncsi_dev_state_config_ec; -#endif /* CONFIG_IPV6 */ } else if (nd->state == ncsi_dev_state_config_ecnt) { if (np->preferred_channel && nc != np->preferred_channel) @@ -1316,8 +1340,38 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) break; } nd->state = ncsi_dev_state_probe_cis; + if (IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC) && + ndp->mlx_multi_host) + nd->state = ncsi_dev_state_probe_mlx_gma; + schedule_work(&ndp->work); break; +#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC) + case ncsi_dev_state_probe_mlx_gma: + ndp->pending_req_num = 1; + + nca.type = NCSI_PKT_CMD_OEM; + nca.package = ndp->active_package->id; + nca.channel = 0; + ret = ncsi_oem_gma_handler_mlx(&nca); + if (ret) + goto error; + + nd->state = ncsi_dev_state_probe_mlx_smaf; + break; + case ncsi_dev_state_probe_mlx_smaf: + ndp->pending_req_num = 1; + + nca.type = NCSI_PKT_CMD_OEM; + nca.package = ndp->active_package->id; + nca.channel = 0; + ret = ncsi_oem_smaf_mlx(&nca); + if (ret) + goto error; + + nd->state = ncsi_dev_state_probe_cis; + break; +#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */ case ncsi_dev_state_probe_cis: ndp->pending_req_num = NCSI_RESERVED_CHANNEL; @@ -1483,70 +1537,6 @@ out: return -ENODEV; } -#if IS_ENABLED(CONFIG_IPV6) -static int ncsi_inet6addr_event(struct notifier_block *this, - unsigned long event, void *data) -{ - struct inet6_ifaddr *ifa = data; - struct net_device *dev = ifa->idev->dev; - struct ncsi_dev *nd = ncsi_find_dev(dev); - struct ncsi_dev_priv *ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL; - struct ncsi_package *np; - struct ncsi_channel *nc; - struct ncsi_cmd_arg nca; - bool action; - int ret; - - if (!ndp || (ipv6_addr_type(&ifa->addr) & - (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK))) - return NOTIFY_OK; - - switch (event) { - case NETDEV_UP: - action = (++ndp->inet6_addr_num) == 1; - nca.type = NCSI_PKT_CMD_EGMF; - break; - case NETDEV_DOWN: - action = (--ndp->inet6_addr_num == 0); - nca.type = NCSI_PKT_CMD_DGMF; - break; - default: - return NOTIFY_OK; - } - - /* We might not have active channel or packages. The IPv6 - * required multicast will be enabled when active channel - * or packages are chosen. - */ - np = ndp->active_package; - nc = ndp->active_channel; - if (!action || !np || !nc) - return NOTIFY_OK; - - /* We needn't enable or disable it if the function isn't supported */ - if (!(nc->caps[NCSI_CAP_GENERIC].cap & NCSI_CAP_GENERIC_MC)) - return NOTIFY_OK; - - nca.ndp = ndp; - nca.req_flags = 0; - nca.package = np->id; - nca.channel = nc->id; - nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; - ret = ncsi_xmit_cmd(&nca); - if (ret) { - netdev_warn(dev, "Fail to %s global multicast filter (%d)\n", - (event == NETDEV_UP) ? "enable" : "disable", ret); - return NOTIFY_DONE; - } - - return NOTIFY_OK; -} - -static struct notifier_block ncsi_inet6addr_notifier = { - .notifier_call = ncsi_inet6addr_event, -}; -#endif /* CONFIG_IPV6 */ - static int ncsi_kick_channels(struct ncsi_dev_priv *ndp) { struct ncsi_dev *nd = &ndp->ndev; @@ -1691,6 +1681,8 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, { struct ncsi_dev_priv *ndp; struct ncsi_dev *nd; + struct platform_device *pdev; + struct device_node *np; unsigned long flags; int i; @@ -1725,11 +1717,6 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, } spin_lock_irqsave(&ncsi_dev_lock, flags); -#if IS_ENABLED(CONFIG_IPV6) - ndp->inet6_addr_num = 0; - if (list_empty(&ncsi_dev_list)) - register_inet6addr_notifier(&ncsi_inet6addr_notifier); -#endif list_add_tail_rcu(&ndp->node, &ncsi_dev_list); spin_unlock_irqrestore(&ncsi_dev_lock, flags); @@ -1742,6 +1729,13 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, /* Set up generic netlink interface */ ncsi_init_netlink(dev); + pdev = to_platform_device(dev->dev.parent); + if (pdev) { + np = pdev->dev.of_node; + if (np && of_get_property(np, "mlx,multi-host", NULL)) + ndp->mlx_multi_host = true; + } + return nd; } EXPORT_SYMBOL_GPL(ncsi_register_dev); @@ -1896,10 +1890,6 @@ void ncsi_unregister_dev(struct ncsi_dev *nd) spin_lock_irqsave(&ncsi_dev_lock, flags); list_del_rcu(&ndp->node); -#if IS_ENABLED(CONFIG_IPV6) - if (list_empty(&ncsi_dev_list)) - unregister_inet6addr_notifier(&ncsi_inet6addr_notifier); -#endif spin_unlock_irqrestore(&ncsi_dev_lock, flags); ncsi_unregister_netlink(nd->dev); diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index a8e9def593f2..80938b338fee 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -387,6 +387,9 @@ struct ncsi_aen_hncdsc_pkt { #define NCSI_PKT_CMD_OEM 0x50 /* OEM */ #define NCSI_PKT_CMD_PLDM 0x51 /* PLDM request over NCSI over RBT */ #define NCSI_PKT_CMD_GPUUID 0x52 /* Get package UUID */ +#define NCSI_PKT_CMD_QPNPR 0x56 /* Query Pending NC PLDM request */ +#define NCSI_PKT_CMD_SNPR 0x57 /* Send NC PLDM Reply */ + /* NCSI packet responses */ #define NCSI_PKT_RSP_CIS (NCSI_PKT_CMD_CIS + 0x80) @@ -419,6 +422,8 @@ struct ncsi_aen_hncdsc_pkt { #define NCSI_PKT_RSP_OEM (NCSI_PKT_CMD_OEM + 0x80) #define NCSI_PKT_RSP_PLDM (NCSI_PKT_CMD_PLDM + 0x80) #define NCSI_PKT_RSP_GPUUID (NCSI_PKT_CMD_GPUUID + 0x80) +#define NCSI_PKT_RSP_QPNPR (NCSI_PKT_CMD_QPNPR + 0x80) +#define NCSI_PKT_RSP_SNPR (NCSI_PKT_CMD_SNPR + 0x80) /* NCSI response code/reason */ #define NCSI_PKT_RSP_C_COMPLETED 0x0000 /* Command Completed */ diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 7581bf919885..a94bb59793f0 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -47,7 +47,8 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr, if (ntohs(h->code) != NCSI_PKT_RSP_C_COMPLETED || ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR) { netdev_dbg(nr->ndp->ndev.dev, - "NCSI: non zero response/reason code\n"); + "NCSI: non zero response/reason code %04xh, %04xh\n", + ntohs(h->code), ntohs(h->reason)); return -EPERM; } @@ -55,7 +56,7 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr, * sender doesn't support checksum according to NCSI * specification. */ - pchecksum = (__be32 *)((void *)(h + 1) + payload - 4); + pchecksum = (__be32 *)((void *)(h + 1) + ALIGN(payload, 4) - 4); if (ntohl(*pchecksum) == 0) return 0; @@ -63,7 +64,9 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr, sizeof(*h) + payload - 4); if (*pchecksum != htonl(checksum)) { - netdev_dbg(nr->ndp->ndev.dev, "NCSI: checksum mismatched\n"); + netdev_dbg(nr->ndp->ndev.dev, + "NCSI: checksum mismatched; recd: %08x calc: %08x\n", + *pchecksum, htonl(checksum)); return -EINVAL; } @@ -624,6 +627,9 @@ static int ncsi_rsp_handler_oem_mlx_gma(struct ncsi_request *nr) saddr.sa_family = ndev->type; ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; memcpy(saddr.sa_data, &rsp->data[MLX_MAC_ADDR_OFFSET], ETH_ALEN); + /* Set the flag for GMA command which should only be called once */ + ndp->gma_flag = 1; + ret = ops->ndo_set_mac_address(ndev, &saddr); if (ret < 0) netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); @@ -668,6 +674,9 @@ static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr) if (!is_valid_ether_addr((const u8 *)saddr.sa_data)) return -ENXIO; + /* Set the flag for GMA command which should only be called once */ + ndp->gma_flag = 1; + ret = ops->ndo_set_mac_address(ndev, &saddr); if (ret < 0) netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); @@ -1035,6 +1044,11 @@ static int ncsi_rsp_handler_gpuuid(struct ncsi_request *nr) return 0; } +static int ncsi_rsp_handler_pldm(struct ncsi_request *nr) +{ + return 0; +} + static int ncsi_rsp_handler_netlink(struct ncsi_request *nr) { struct ncsi_dev_priv *ndp = nr->ndp; @@ -1083,13 +1097,15 @@ static struct ncsi_rsp_handler { { NCSI_PKT_RSP_GVI, 40, ncsi_rsp_handler_gvi }, { NCSI_PKT_RSP_GC, 32, ncsi_rsp_handler_gc }, { NCSI_PKT_RSP_GP, -1, ncsi_rsp_handler_gp }, - { NCSI_PKT_RSP_GCPS, 172, ncsi_rsp_handler_gcps }, - { NCSI_PKT_RSP_GNS, 172, ncsi_rsp_handler_gns }, - { NCSI_PKT_RSP_GNPTS, 172, ncsi_rsp_handler_gnpts }, + { NCSI_PKT_RSP_GCPS, 204, ncsi_rsp_handler_gcps }, + { NCSI_PKT_RSP_GNS, 32, ncsi_rsp_handler_gns }, + { NCSI_PKT_RSP_GNPTS, 48, ncsi_rsp_handler_gnpts }, { NCSI_PKT_RSP_GPS, 8, ncsi_rsp_handler_gps }, { NCSI_PKT_RSP_OEM, -1, ncsi_rsp_handler_oem }, - { NCSI_PKT_RSP_PLDM, 0, NULL }, - { NCSI_PKT_RSP_GPUUID, 20, ncsi_rsp_handler_gpuuid } + { NCSI_PKT_RSP_PLDM, -1, ncsi_rsp_handler_pldm }, + { NCSI_PKT_RSP_GPUUID, 20, ncsi_rsp_handler_gpuuid }, + { NCSI_PKT_RSP_QPNPR, -1, ncsi_rsp_handler_pldm }, + { NCSI_PKT_RSP_SNPR, -1, ncsi_rsp_handler_pldm } }; int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 0d65f4d39494..91efae88e8c2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -20,7 +20,7 @@ config NETFILTER_FAMILY_ARP bool config NETFILTER_NETLINK_ACCT -tristate "Netfilter NFACCT over NFNETLINK interface" + tristate "Netfilter NFACCT over NFNETLINK interface" depends on NETFILTER_ADVANCED select NETFILTER_NETLINK help @@ -34,7 +34,7 @@ config NETFILTER_NETLINK_QUEUE help If this option is enabled, the kernel will include support for queueing packets via NFNETLINK. - + config NETFILTER_NETLINK_LOG tristate "Netfilter LOG over NFNETLINK interface" default m if NETFILTER_ADVANCED=n @@ -697,7 +697,7 @@ config NF_FLOW_TABLE_INET tristate "Netfilter flow table mixed IPv4/IPv6 module" depends on NF_FLOW_TABLE help - This option adds the flow table mixed IPv4/IPv6 support. + This option adds the flow table mixed IPv4/IPv6 support. To compile it as a module, choose M here. @@ -1502,7 +1502,7 @@ config NETFILTER_XT_MATCH_REALM This option adds a `realm' match, which allows you to use the realm key from the routing subsystem inside iptables. - This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option + This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option in tc world. If you want to compile it as a module, say M here and read @@ -1523,7 +1523,7 @@ config NETFILTER_XT_MATCH_SCTP depends on NETFILTER_ADVANCED default IP_SCTP help - With this option enabled, you will be able to use the + With this option enabled, you will be able to use the `sctp' match in order to match on SCTP source/destination ports and SCTP chunk types. diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 9270a7fae484..3f572e5a975e 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -81,7 +81,8 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nft_chain_route.o nf_tables_offload.o nf_tables_set-objs := nf_tables_set_core.o \ - nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o + nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ + nft_set_pipapo.o obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NF_TABLES_SET) += nf_tables_set.o @@ -120,11 +121,12 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o # flow table infrastructure obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o -nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o +nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ + nf_flow_table_offload.o obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o -# generic X tables +# generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o # combos diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 5d5bdf450091..78f046ec506f 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -536,6 +536,26 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, } EXPORT_SYMBOL(nf_hook_slow); +void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, + const struct nf_hook_entries *e) +{ + struct sk_buff *skb, *next; + struct list_head sublist; + int ret; + + INIT_LIST_HEAD(&sublist); + + list_for_each_entry_safe(skb, next, head, list) { + skb_list_del_init(skb); + ret = nf_hook_slow(skb, state, e, 0); + if (ret == 1) + list_add_tail(&skb->list, &sublist); + } + /* Put passed packets back on main list */ + list_splice(&sublist, head); +} +EXPORT_SYMBOL(nf_hook_slow_list); + /* This needs to be compiled in any case to avoid dependencies between the * nfnetlink_queue code and nf_conntrack. */ diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 063df74b4647..26ab0e9612d8 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -60,9 +60,9 @@ mtype_destroy(struct ip_set *set) if (SET_WITH_TIMEOUT(set)) del_timer_sync(&map->gc); - ip_set_free(map->members); if (set->dsize && set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); + ip_set_free(map->members); ip_set_free(map); set->data = NULL; @@ -75,7 +75,7 @@ mtype_flush(struct ip_set *set) if (set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); - memset(map->members, 0, map->memsize); + bitmap_zero(map->members, map->elements); set->elements = 0; set->ext_size = 0; } @@ -192,7 +192,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, } #ifndef IP_SET_BITMAP_STORED_TIMEOUT -static inline bool +static bool mtype_is_filled(const struct mtype_elem *x) { return true; diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 11ff9d4a7006..0a2196f59106 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -37,7 +37,7 @@ MODULE_ALIAS("ip_set_bitmap:ip"); /* Type structure */ struct bitmap_ip { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -55,7 +55,7 @@ struct bitmap_ip_adt_elem { u16 id; }; -static inline u32 +static u32 ip_to_id(const struct bitmap_ip *m, u32 ip) { return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip) / m->hosts; @@ -63,33 +63,33 @@ ip_to_id(const struct bitmap_ip *m, u32 ip) /* Common functions */ -static inline int +static int bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map, size_t dsize) { return !!test_bit(id, map->members); } -static inline int +static int bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, u32 flags, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_ip_do_del(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map) { return !test_and_clear_bit(e->id, map->members); } -static inline int +static int bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id, size_t dsize) { @@ -97,7 +97,7 @@ bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id, htonl(map->first_ip + id * map->hosts)); } -static inline int +static int bitmap_ip_do_head(struct sk_buff *skb, const struct bitmap_ip *map) { return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || @@ -220,7 +220,7 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, u32 first_ip, u32 last_ip, u32 elements, u32 hosts, u8 netmask) { - map->members = ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_ip = first_ip; @@ -237,6 +237,18 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, return true; } +static u32 +range_to_mask(u32 from, u32 to, u8 *bits) +{ + u32 mask = 0xFFFFFFFE; + + *bits = 32; + while (--(*bits) > 0 && mask && (to & mask) != from) + mask <<= 1; + + return mask; +} + static int bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) @@ -310,7 +322,7 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; - map->memsize = bitmap_bytes(0, elements - 1); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_ip; if (!init_map_ip(set, map, first_ip, last_ip, elements, hosts, netmask)) { diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 1d4e63326e68..739e343efaf6 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -42,7 +42,7 @@ enum { /* Type structure */ struct bitmap_ipmac { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -65,7 +65,7 @@ struct bitmap_ipmac_elem { unsigned char filled; } __aligned(__alignof__(u64)); -static inline u32 +static u32 ip_to_id(const struct bitmap_ipmac *m, u32 ip) { return ip - m->first_ip; @@ -79,7 +79,7 @@ ip_to_id(const struct bitmap_ipmac *m, u32 ip) /* Common functions */ -static inline int +static int bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, const struct bitmap_ipmac *map, size_t dsize) { @@ -94,7 +94,7 @@ bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, return -EAGAIN; } -static inline int +static int bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) { const struct bitmap_ipmac_elem *elem; @@ -106,13 +106,13 @@ bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) return elem->filled == MAC_FILLED; } -static inline int +static int bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem) { return elem->filled == MAC_FILLED; } -static inline int +static int bitmap_ipmac_add_timeout(unsigned long *timeout, const struct bitmap_ipmac_adt_elem *e, const struct ip_set_ext *ext, struct ip_set *set, @@ -139,7 +139,7 @@ bitmap_ipmac_add_timeout(unsigned long *timeout, return 0; } -static inline int +static int bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac *map, u32 flags, size_t dsize) { @@ -177,14 +177,14 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, return IPSET_ADD_STORE_PLAIN_TIMEOUT; } -static inline int +static int bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac *map) { return !test_and_clear_bit(e->id, map->members); } -static inline int +static int bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, u32 id, size_t dsize) { @@ -197,7 +197,7 @@ bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether)); } -static inline int +static int bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map) { return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || @@ -299,7 +299,7 @@ static bool init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, u32 first_ip, u32 last_ip, u32 elements) { - map->members = ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_ip = first_ip; @@ -360,7 +360,7 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; - map->memsize = bitmap_bytes(0, elements - 1); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_ipmac; if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { kfree(map); diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 704a0dda1609..b49978dd810d 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -30,7 +30,7 @@ MODULE_ALIAS("ip_set_bitmap:port"); /* Type structure */ struct bitmap_port { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u16 first_port; /* host byte order, included in range */ u16 last_port; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -46,7 +46,7 @@ struct bitmap_port_adt_elem { u16 id; }; -static inline u16 +static u16 port_to_id(const struct bitmap_port *m, u16 port) { return port - m->first_port; @@ -54,34 +54,34 @@ port_to_id(const struct bitmap_port *m, u16 port) /* Common functions */ -static inline int +static int bitmap_port_do_test(const struct bitmap_port_adt_elem *e, const struct bitmap_port *map, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_port_gc_test(u16 id, const struct bitmap_port *map, size_t dsize) { return !!test_bit(id, map->members); } -static inline int +static int bitmap_port_do_add(const struct bitmap_port_adt_elem *e, struct bitmap_port *map, u32 flags, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_port_do_del(const struct bitmap_port_adt_elem *e, struct bitmap_port *map) { return !test_and_clear_bit(e->id, map->members); } -static inline int +static int bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id, size_t dsize) { @@ -89,13 +89,40 @@ bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id, htons(map->first_port + id)); } -static inline int +static int bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map) { return nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) || nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)); } +static bool +ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port) +{ + bool ret; + u8 proto; + + switch (pf) { + case NFPROTO_IPV4: + ret = ip_set_get_ip4_port(skb, src, port, &proto); + break; + case NFPROTO_IPV6: + ret = ip_set_get_ip6_port(skb, src, port, &proto); + break; + default: + return false; + } + if (!ret) + return ret; + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + return true; + default: + return false; + } +} + static int bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -204,7 +231,7 @@ static bool init_map_port(struct ip_set *set, struct bitmap_port *map, u16 first_port, u16 last_port) { - map->members = ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(map->elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_port = first_port; @@ -244,7 +271,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], return -ENOMEM; map->elements = elements; - map->memsize = bitmap_bytes(0, map->elements); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_port; if (!init_map_port(set, map, first_port, last_port)) { kfree(map); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index e64d5f9a89dd..69c107f9ba8d 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -35,7 +35,7 @@ struct ip_set_net { static unsigned int ip_set_net_id __read_mostly; -static inline struct ip_set_net *ip_set_pernet(struct net *net) +static struct ip_set_net *ip_set_pernet(struct net *net) { return net_generic(net, ip_set_net_id); } @@ -67,13 +67,13 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); * serialized by ip_set_type_mutex. */ -static inline void +static void ip_set_type_lock(void) { mutex_lock(&ip_set_type_mutex); } -static inline void +static void ip_set_type_unlock(void) { mutex_unlock(&ip_set_type_mutex); @@ -277,7 +277,7 @@ ip_set_free(void *members) } EXPORT_SYMBOL_GPL(ip_set_free); -static inline bool +static bool flag_nested(const struct nlattr *nla) { return nla->nla_type & NLA_F_NESTED; @@ -296,7 +296,8 @@ ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr) if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL)) + if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, + ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4))) return -IPSET_ERR_PROTOCOL; @@ -314,7 +315,8 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL)) + if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, + ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6))) return -IPSET_ERR_PROTOCOL; @@ -325,6 +327,83 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); +static u32 +ip_set_timeout_get(const unsigned long *timeout) +{ + u32 t; + + if (*timeout == IPSET_ELEM_PERMANENT) + return 0; + + t = jiffies_to_msecs(*timeout - jiffies) / MSEC_PER_SEC; + /* Zero value in userspace means no timeout */ + return t == 0 ? 1 : t; +} + +static char * +ip_set_comment_uget(struct nlattr *tb) +{ + return nla_data(tb); +} + +/* Called from uadd only, protected by the set spinlock. + * The kadt functions don't use the comment extensions in any way. + */ +void +ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, + const struct ip_set_ext *ext) +{ + struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1); + size_t len = ext->comment ? strlen(ext->comment) : 0; + + if (unlikely(c)) { + set->ext_size -= sizeof(*c) + strlen(c->str) + 1; + kfree_rcu(c, rcu); + rcu_assign_pointer(comment->c, NULL); + } + if (!len) + return; + if (unlikely(len > IPSET_MAX_COMMENT_SIZE)) + len = IPSET_MAX_COMMENT_SIZE; + c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC); + if (unlikely(!c)) + return; + strlcpy(c->str, ext->comment, len + 1); + set->ext_size += sizeof(*c) + strlen(c->str) + 1; + rcu_assign_pointer(comment->c, c); +} +EXPORT_SYMBOL_GPL(ip_set_init_comment); + +/* Used only when dumping a set, protected by rcu_read_lock() */ +static int +ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment) +{ + struct ip_set_comment_rcu *c = rcu_dereference(comment->c); + + if (!c) + return 0; + return nla_put_string(skb, IPSET_ATTR_COMMENT, c->str); +} + +/* Called from uadd/udel, flush or the garbage collectors protected + * by the set spinlock. + * Called when the set is destroyed and when there can't be any user + * of the set data anymore. + */ +static void +ip_set_comment_free(struct ip_set *set, void *ptr) +{ + struct ip_set_comment *comment = ptr; + struct ip_set_comment_rcu *c; + + c = rcu_dereference_protected(comment->c, 1); + if (unlikely(!c)) + return; + set->ext_size -= sizeof(*c) + strlen(c->str) + 1; + kfree_rcu(c, rcu); + rcu_assign_pointer(comment->c, NULL); +} + typedef void (*destroyer)(struct ip_set *, void *); /* ipset data extension types, in size order */ @@ -351,12 +430,12 @@ const struct ip_set_ext_type ip_set_extensions[] = { .flag = IPSET_FLAG_WITH_COMMENT, .len = sizeof(struct ip_set_comment), .align = __alignof__(struct ip_set_comment), - .destroy = (destroyer) ip_set_comment_free, + .destroy = ip_set_comment_free, }, }; EXPORT_SYMBOL_GPL(ip_set_extensions); -static inline bool +static bool add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[]) { return ip_set_extensions[id].flag ? @@ -446,6 +525,46 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], } EXPORT_SYMBOL_GPL(ip_set_get_extensions); +static u64 +ip_set_get_bytes(const struct ip_set_counter *counter) +{ + return (u64)atomic64_read(&(counter)->bytes); +} + +static u64 +ip_set_get_packets(const struct ip_set_counter *counter) +{ + return (u64)atomic64_read(&(counter)->packets); +} + +static bool +ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter) +{ + return nla_put_net64(skb, IPSET_ATTR_BYTES, + cpu_to_be64(ip_set_get_bytes(counter)), + IPSET_ATTR_PAD) || + nla_put_net64(skb, IPSET_ATTR_PACKETS, + cpu_to_be64(ip_set_get_packets(counter)), + IPSET_ATTR_PAD); +} + +static bool +ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo) +{ + /* Send nonzero parameters only */ + return ((skbinfo->skbmark || skbinfo->skbmarkmask) && + nla_put_net64(skb, IPSET_ATTR_SKBMARK, + cpu_to_be64((u64)skbinfo->skbmark << 32 | + skbinfo->skbmarkmask), + IPSET_ATTR_PAD)) || + (skbinfo->skbprio && + nla_put_net32(skb, IPSET_ATTR_SKBPRIO, + cpu_to_be32(skbinfo->skbprio))) || + (skbinfo->skbqueue && + nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, + cpu_to_be16(skbinfo->skbqueue))); +} + int ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, const void *e, bool active) @@ -471,6 +590,55 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, } EXPORT_SYMBOL_GPL(ip_set_put_extensions); +static bool +ip_set_match_counter(u64 counter, u64 match, u8 op) +{ + switch (op) { + case IPSET_COUNTER_NONE: + return true; + case IPSET_COUNTER_EQ: + return counter == match; + case IPSET_COUNTER_NE: + return counter != match; + case IPSET_COUNTER_LT: + return counter < match; + case IPSET_COUNTER_GT: + return counter > match; + } + return false; +} + +static void +ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter) +{ + atomic64_add((long long)bytes, &(counter)->bytes); +} + +static void +ip_set_add_packets(u64 packets, struct ip_set_counter *counter) +{ + atomic64_add((long long)packets, &(counter)->packets); +} + +static void +ip_set_update_counter(struct ip_set_counter *counter, + const struct ip_set_ext *ext, u32 flags) +{ + if (ext->packets != ULLONG_MAX && + !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) { + ip_set_add_bytes(ext->bytes, counter); + ip_set_add_packets(ext->packets, counter); + } +} + +static void +ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, + const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + mext->skbinfo = *skbinfo; +} + bool ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags, void *data) @@ -506,7 +674,7 @@ EXPORT_SYMBOL_GPL(ip_set_match_extensions); * The set behind an index may change by swapping only, from userspace. */ -static inline void +static void __ip_set_get(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -514,7 +682,7 @@ __ip_set_get(struct ip_set *set) write_unlock_bh(&ip_set_ref_lock); } -static inline void +static void __ip_set_put(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -526,7 +694,7 @@ __ip_set_put(struct ip_set *set) /* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need * a separate reference counter */ -static inline void +static void __ip_set_put_netlink(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -541,7 +709,7 @@ __ip_set_put_netlink(struct ip_set *set) * so it can't be destroyed (or changed) under our foot. */ -static inline struct ip_set * +static struct ip_set * ip_set_rcu_get(struct net *net, ip_set_id_t index) { struct ip_set *set; @@ -670,7 +838,7 @@ EXPORT_SYMBOL_GPL(ip_set_get_byname); * */ -static inline void +static void __ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index) { struct ip_set *set; @@ -934,7 +1102,8 @@ static int ip_set_create(struct net *net, struct sock *ctnl, /* Without holding any locks, create private part. */ if (attr[IPSET_ATTR_DATA] && - nla_parse_nested_deprecated(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], set->type->create_policy, NULL)) { + nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], + set->type->create_policy, NULL)) { ret = -IPSET_ERR_PROTOCOL; goto put_out; } @@ -1252,6 +1421,30 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, #define DUMP_TYPE(arg) (((u32)(arg)) & 0x0000FFFF) #define DUMP_FLAGS(arg) (((u32)(arg)) >> 16) +int +ip_set_put_flags(struct sk_buff *skb, struct ip_set *set) +{ + u32 cadt_flags = 0; + + if (SET_WITH_TIMEOUT(set)) + if (unlikely(nla_put_net32(skb, IPSET_ATTR_TIMEOUT, + htonl(set->timeout)))) + return -EMSGSIZE; + if (SET_WITH_COUNTER(set)) + cadt_flags |= IPSET_FLAG_WITH_COUNTERS; + if (SET_WITH_COMMENT(set)) + cadt_flags |= IPSET_FLAG_WITH_COMMENT; + if (SET_WITH_SKBINFO(set)) + cadt_flags |= IPSET_FLAG_WITH_SKBINFO; + if (SET_WITH_FORCEADD(set)) + cadt_flags |= IPSET_FLAG_WITH_FORCEADD; + + if (!cadt_flags) + return 0; + return nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(cadt_flags)); +} +EXPORT_SYMBOL_GPL(ip_set_put_flags); + static int ip_set_dump_done(struct netlink_callback *cb) { @@ -1281,32 +1474,43 @@ dump_attrs(struct nlmsghdr *nlh) } } +static const struct nla_policy +ip_set_dump_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, + [IPSET_ATTR_FLAGS] = { .type = NLA_U32 }, +}; + static int -dump_init(struct netlink_callback *cb, struct ip_set_net *inst) +ip_set_dump_start(struct netlink_callback *cb) { struct nlmsghdr *nlh = nlmsg_hdr(cb->skb); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1]; struct nlattr *attr = (void *)nlh + min_len; + struct sk_buff *skb = cb->skb; + struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk)); u32 dump_type; - ip_set_id_t index; int ret; - ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr, - nlh->nlmsg_len - min_len, - ip_set_setname_policy, NULL); + ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, + nlh->nlmsg_len - min_len, + ip_set_dump_policy, NULL); if (ret) - return ret; + goto error; cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]); if (cda[IPSET_ATTR_SETNAME]) { + ip_set_id_t index; struct ip_set *set; set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]), &index); - if (!set) - return -ENOENT; - + if (!set) { + ret = -ENOENT; + goto error; + } dump_type = DUMP_ONE; cb->args[IPSET_CB_INDEX] = index; } else { @@ -1322,10 +1526,17 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) cb->args[IPSET_CB_DUMP] = dump_type; return 0; + +error: + /* We have to create and send the error message manually :-( */ + if (nlh->nlmsg_flags & NLM_F_ACK) { + netlink_ack(cb->skb, nlh, ret, NULL); + } + return ret; } static int -ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) +ip_set_dump_do(struct sk_buff *skb, struct netlink_callback *cb) { ip_set_id_t index = IPSET_INVALID_ID, max; struct ip_set *set = NULL; @@ -1336,18 +1547,8 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) bool is_destroyed; int ret = 0; - if (!cb->args[IPSET_CB_DUMP]) { - ret = dump_init(cb, inst); - if (ret < 0) { - nlh = nlmsg_hdr(cb->skb); - /* We have to create and send the error message - * manually :-( - */ - if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(cb->skb, nlh, ret, NULL); - return ret; - } - } + if (!cb->args[IPSET_CB_DUMP]) + return -EINVAL; if (cb->args[IPSET_CB_INDEX] >= inst->ip_set_max) goto out; @@ -1483,7 +1684,8 @@ static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb, { struct netlink_dump_control c = { - .dump = ip_set_dump_start, + .start = ip_set_dump_start, + .dump = ip_set_dump_do, .done = ip_set_dump_done, }; return netlink_dump_start(ctnl, skb, nlh, &c); @@ -1543,9 +1745,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, memcpy(&errmsg->msg, nlh, nlh->nlmsg_len); cmdattr = (void *)&errmsg->msg + min_len; - ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr, - nlh->nlmsg_len - min_len, - ip_set_adt_policy, NULL); + ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, cmdattr, + nlh->nlmsg_len - min_len, ip_set_adt_policy, + NULL); if (ret) { nlmsg_free(skb2); @@ -1596,7 +1798,9 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, use_lineno = !!attr[IPSET_ATTR_LINENO]; if (attr[IPSET_ATTR_DATA]) { - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) + if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, + attr[IPSET_ATTR_DATA], + set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(ctnl, skb, set, tb, adt, flags, use_lineno); @@ -1606,7 +1810,8 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { if (nla_type(nla) != IPSET_ATTR_DATA || !flag_nested(nla) || - nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) + nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, + set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(ctnl, skb, set, tb, adt, flags, use_lineno); @@ -1644,6 +1849,7 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; int ret = 0; + u32 lineno; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || @@ -1655,11 +1861,12 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, if (!set) return -ENOENT; - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) + if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], + set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; rcu_read_lock_bh(); - ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0); + ret = set->variant->uadt(set, tb, IPSET_TEST, &lineno, 0, 0); rcu_read_unlock_bh(); /* Userspace can't trigger element to be re-added */ if (ret == -EAGAIN) @@ -1961,7 +2168,7 @@ static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { [IPSET_CMD_LIST] = { .call = ip_set_dump, .attr_count = IPSET_ATTR_CMD_MAX, - .policy = ip_set_setname_policy, + .policy = ip_set_dump_policy, }, [IPSET_CMD_SAVE] = { .call = ip_set_dump, @@ -2069,8 +2276,9 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) } req_version->version = IPSET_PROTOCOL; - ret = copy_to_user(user, req_version, - sizeof(struct ip_set_req_version)); + if (copy_to_user(user, req_version, + sizeof(struct ip_set_req_version))) + ret = -EFAULT; goto done; } case IP_SET_OP_GET_BYNAME: { @@ -2129,7 +2337,8 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) } /* end of switch(op) */ copy: - ret = copy_to_user(user, data, copylen); + if (copy_to_user(user, data, copylen)) + ret = -EFAULT; done: vfree(data); diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c index 2b8f959574b4..36615eb3eae1 100644 --- a/net/netfilter/ipset/ip_set_getport.c +++ b/net/netfilter/ipset/ip_set_getport.c @@ -148,31 +148,3 @@ ip_set_get_ip6_port(const struct sk_buff *skb, bool src, } EXPORT_SYMBOL_GPL(ip_set_get_ip6_port); #endif - -bool -ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port) -{ - bool ret; - u8 proto; - - switch (pf) { - case NFPROTO_IPV4: - ret = ip_set_get_ip4_port(skb, src, port, &proto); - break; - case NFPROTO_IPV6: - ret = ip_set_get_ip6_port(skb, src, port, &proto); - break; - default: - return false; - } - if (!ret) - return ret; - switch (proto) { - case IPPROTO_TCP: - case IPPROTO_UDP: - return true; - default: - return false; - } -} -EXPORT_SYMBOL_GPL(ip_set_get_ip_port); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 0feb77fa9edc..7480ce55b5c8 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -7,7 +7,7 @@ #include <linux/rcupdate.h> #include <linux/jhash.h> #include <linux/types.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> +#include <linux/netfilter/ipset/ip_set.h> #define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c) #define ipset_dereference_protected(p, set) \ @@ -39,7 +39,7 @@ #ifdef IP_SET_HASH_WITH_MULTI #define AHASH_MAX(h) ((h)->ahash_max) -static inline u8 +static u8 tune_ahash_max(u8 curr, u32 multi) { u32 n; @@ -909,7 +909,7 @@ out: return ret; } -static inline int +static int mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, struct ip_set_ext *mext, struct ip_set *set, u32 flags) { @@ -953,7 +953,7 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0])); #endif key = HKEY(d, h->initval, t->htable_bits); - n = rcu_dereference_bh(hbucket(t, key)); + n = rcu_dereference_bh(hbucket(t, key)); if (!n) continue; for (i = 0; i < n->pos; i++) { diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index f4432d9fcad0..5d6d68eaf6a9 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -44,7 +44,7 @@ struct hash_ip4_elem { /* Common functions */ -static inline bool +static bool hash_ip4_data_equal(const struct hash_ip4_elem *e1, const struct hash_ip4_elem *e2, u32 *multi) @@ -63,7 +63,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e) { next->ip = e->ip; @@ -171,7 +171,7 @@ struct hash_ip6_elem { /* Common functions */ -static inline bool +static bool hash_ip6_data_equal(const struct hash_ip6_elem *ip1, const struct hash_ip6_elem *ip2, u32 *multi) @@ -179,7 +179,7 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1, return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); } -static inline void +static void hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix) { ip6_netmask(ip, prefix); @@ -196,7 +196,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ip6_data_next(struct hash_ip6_elem *next, const struct hash_ip6_elem *e) { } diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c index 24d8f4df4230..eceb7bc4a93a 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmac.c +++ b/net/netfilter/ipset/ip_set_hash_ipmac.c @@ -47,7 +47,7 @@ struct hash_ipmac4_elem { /* Common functions */ -static inline bool +static bool hash_ipmac4_data_equal(const struct hash_ipmac4_elem *e1, const struct hash_ipmac4_elem *e2, u32 *multi) @@ -67,7 +67,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmac4_data_next(struct hash_ipmac4_elem *next, const struct hash_ipmac4_elem *e) { @@ -154,7 +154,7 @@ struct hash_ipmac6_elem { /* Common functions */ -static inline bool +static bool hash_ipmac6_data_equal(const struct hash_ipmac6_elem *e1, const struct hash_ipmac6_elem *e2, u32 *multi) @@ -175,7 +175,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmac6_data_next(struct hash_ipmac6_elem *next, const struct hash_ipmac6_elem *e) { @@ -209,7 +209,7 @@ hash_ipmac6_kadt(struct ip_set *set, const struct sk_buff *skb, (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; - if (opt->flags & IPSET_DIM_ONE_SRC) + if (opt->flags & IPSET_DIM_TWO_SRC) ether_addr_copy(e.ether, eth_hdr(skb)->h_source); else ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 7a1734aad0c5..aba1df617d6e 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -42,7 +42,7 @@ struct hash_ipmark4_elem { /* Common functions */ -static inline bool +static bool hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1, const struct hash_ipmark4_elem *ip2, u32 *multi) @@ -64,7 +64,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmark4_data_next(struct hash_ipmark4_elem *next, const struct hash_ipmark4_elem *d) { @@ -165,7 +165,7 @@ struct hash_ipmark6_elem { /* Common functions */ -static inline bool +static bool hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1, const struct hash_ipmark6_elem *ip2, u32 *multi) @@ -187,7 +187,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmark6_data_next(struct hash_ipmark6_elem *next, const struct hash_ipmark6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 32e240658334..1ff228717e29 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -47,7 +47,7 @@ struct hash_ipport4_elem { /* Common functions */ -static inline bool +static bool hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, const struct hash_ipport4_elem *ip2, u32 *multi) @@ -71,7 +71,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipport4_data_next(struct hash_ipport4_elem *next, const struct hash_ipport4_elem *d) { @@ -202,7 +202,7 @@ struct hash_ipport6_elem { /* Common functions */ -static inline bool +static bool hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1, const struct hash_ipport6_elem *ip2, u32 *multi) @@ -226,7 +226,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipport6_data_next(struct hash_ipport6_elem *next, const struct hash_ipport6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 15d419353179..fa88afd812fa 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -46,7 +46,7 @@ struct hash_ipportip4_elem { u8 padding; }; -static inline bool +static bool hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, const struct hash_ipportip4_elem *ip2, u32 *multi) @@ -72,7 +72,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportip4_data_next(struct hash_ipportip4_elem *next, const struct hash_ipportip4_elem *d) { @@ -210,7 +210,7 @@ struct hash_ipportip6_elem { /* Common functions */ -static inline bool +static bool hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, const struct hash_ipportip6_elem *ip2, u32 *multi) @@ -236,7 +236,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportip6_data_next(struct hash_ipportip6_elem *next, const struct hash_ipportip6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 7a4d7afd4121..eef6ecfcb409 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -59,7 +59,7 @@ struct hash_ipportnet4_elem { /* Common functions */ -static inline bool +static bool hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, const struct hash_ipportnet4_elem *ip2, u32 *multi) @@ -71,25 +71,25 @@ hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) { elem->ip2 &= ip_set_netmask(cidr); @@ -116,7 +116,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, const struct hash_ipportnet4_elem *d) { @@ -308,7 +308,7 @@ struct hash_ipportnet6_elem { /* Common functions */ -static inline bool +static bool hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, const struct hash_ipportnet6_elem *ip2, u32 *multi) @@ -320,25 +320,25 @@ hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip2, cidr); @@ -365,7 +365,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportnet6_data_next(struct hash_ipportnet6_elem *next, const struct hash_ipportnet6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index d94c585d33c5..0b61593165ef 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -37,7 +37,7 @@ struct hash_mac4_elem { /* Common functions */ -static inline bool +static bool hash_mac4_data_equal(const struct hash_mac4_elem *e1, const struct hash_mac4_elem *e2, u32 *multi) @@ -45,7 +45,7 @@ hash_mac4_data_equal(const struct hash_mac4_elem *e1, return ether_addr_equal(e1->ether, e2->ether); } -static inline bool +static bool hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e) { if (nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether)) @@ -56,7 +56,7 @@ nla_put_failure: return true; } -static inline void +static void hash_mac4_data_next(struct hash_mac4_elem *next, const struct hash_mac4_elem *e) { diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index c259cbc3ef45..136cf0781d3a 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -47,7 +47,7 @@ struct hash_net4_elem { /* Common functions */ -static inline bool +static bool hash_net4_data_equal(const struct hash_net4_elem *ip1, const struct hash_net4_elem *ip2, u32 *multi) @@ -56,25 +56,25 @@ hash_net4_data_equal(const struct hash_net4_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_net4_do_data_match(const struct hash_net4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); @@ -97,7 +97,7 @@ nla_put_failure: return true; } -static inline void +static void hash_net4_data_next(struct hash_net4_elem *next, const struct hash_net4_elem *d) { @@ -212,7 +212,7 @@ struct hash_net6_elem { /* Common functions */ -static inline bool +static bool hash_net6_data_equal(const struct hash_net6_elem *ip1, const struct hash_net6_elem *ip2, u32 *multi) @@ -221,25 +221,25 @@ hash_net6_data_equal(const struct hash_net6_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_net6_do_data_match(const struct hash_net6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); @@ -262,7 +262,7 @@ nla_put_failure: return true; } -static inline void +static void hash_net6_data_next(struct hash_net6_elem *next, const struct hash_net6_elem *d) { @@ -368,6 +368,7 @@ static struct ip_set_type hash_net_type __read_mostly = { [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 87b29f971226..be5e95a0d876 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -25,7 +25,8 @@ /* 3 Counters support added */ /* 4 Comments support added */ /* 5 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */ +/* 6 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 7 /* interface wildcard support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -57,12 +58,13 @@ struct hash_netiface4_elem { u8 cidr; u8 nomatch; u8 elem; + u8 wildcard; char iface[IFNAMSIZ]; }; /* Common functions */ -static inline bool +static bool hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, const struct hash_netiface4_elem *ip2, u32 *multi) @@ -71,28 +73,30 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && - strcmp(ip1->iface, ip2->iface) == 0; + (ip1->wildcard ? + strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 : + strcmp(ip1->iface, ip2->iface) == 0); } -static inline int +static int hash_netiface4_do_data_match(const struct hash_netiface4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netiface4_data_set_flags(struct hash_netiface4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netiface4_data_reset_flags(struct hash_netiface4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); @@ -103,7 +107,8 @@ static bool hash_netiface4_data_list(struct sk_buff *skb, const struct hash_netiface4_elem *data) { - u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; + u32 flags = (data->physdev ? IPSET_FLAG_PHYSDEV : 0) | + (data->wildcard ? IPSET_FLAG_IFACE_WILDCARD : 0); if (data->nomatch) flags |= IPSET_FLAG_NOMATCH; @@ -119,7 +124,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netiface4_data_next(struct hash_netiface4_elem *next, const struct hash_netiface4_elem *d) { @@ -229,6 +234,8 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); + if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD) + e.wildcard = 1; } if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { e.ip = htonl(ip & ip_set_hostmask(e.cidr)); @@ -280,12 +287,13 @@ struct hash_netiface6_elem { u8 cidr; u8 nomatch; u8 elem; + u8 wildcard; char iface[IFNAMSIZ]; }; /* Common functions */ -static inline bool +static bool hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, const struct hash_netiface6_elem *ip2, u32 *multi) @@ -294,28 +302,30 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && - strcmp(ip1->iface, ip2->iface) == 0; + (ip1->wildcard ? + strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 : + strcmp(ip1->iface, ip2->iface) == 0); } -static inline int +static int hash_netiface6_do_data_match(const struct hash_netiface6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netiface6_data_set_flags(struct hash_netiface6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netiface6_data_reset_flags(struct hash_netiface6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netiface6_data_netmask(struct hash_netiface6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); @@ -326,7 +336,8 @@ static bool hash_netiface6_data_list(struct sk_buff *skb, const struct hash_netiface6_elem *data) { - u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; + u32 flags = (data->physdev ? IPSET_FLAG_PHYSDEV : 0) | + (data->wildcard ? IPSET_FLAG_IFACE_WILDCARD : 0); if (data->nomatch) flags |= IPSET_FLAG_NOMATCH; @@ -342,7 +353,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netiface6_data_next(struct hash_netiface6_elem *next, const struct hash_netiface6_elem *d) { @@ -440,6 +451,8 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); + if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD) + e.wildcard = 1; } ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index a3ae69bfee66..da4ef910b12d 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -52,7 +52,7 @@ struct hash_netnet4_elem { /* Common functions */ -static inline bool +static bool hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, const struct hash_netnet4_elem *ip2, u32 *multi) @@ -61,32 +61,32 @@ hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, ip1->ccmp == ip2->ccmp; } -static inline int +static int hash_netnet4_do_data_match(const struct hash_netnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netnet4_data_set_flags(struct hash_netnet4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem, struct hash_netnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner) { if (inner) { @@ -117,7 +117,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netnet4_data_next(struct hash_netnet4_elem *next, const struct hash_netnet4_elem *d) { @@ -282,7 +282,7 @@ struct hash_netnet6_elem { /* Common functions */ -static inline bool +static bool hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, const struct hash_netnet6_elem *ip2, u32 *multi) @@ -292,32 +292,32 @@ hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, ip1->ccmp == ip2->ccmp; } -static inline int +static int hash_netnet6_do_data_match(const struct hash_netnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netnet6_data_set_flags(struct hash_netnet6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem, struct hash_netnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner) { if (inner) { @@ -348,7 +348,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netnet6_data_next(struct hash_netnet6_elem *next, const struct hash_netnet6_elem *d) { @@ -476,6 +476,7 @@ static struct ip_set_type hash_netnet_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 799f2272cc65..34448df80fb9 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -57,7 +57,7 @@ struct hash_netport4_elem { /* Common functions */ -static inline bool +static bool hash_netport4_data_equal(const struct hash_netport4_elem *ip1, const struct hash_netport4_elem *ip2, u32 *multi) @@ -68,25 +68,25 @@ hash_netport4_data_equal(const struct hash_netport4_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_netport4_do_data_match(const struct hash_netport4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); @@ -112,7 +112,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netport4_data_next(struct hash_netport4_elem *next, const struct hash_netport4_elem *d) { @@ -270,7 +270,7 @@ struct hash_netport6_elem { /* Common functions */ -static inline bool +static bool hash_netport6_data_equal(const struct hash_netport6_elem *ip1, const struct hash_netport6_elem *ip2, u32 *multi) @@ -281,25 +281,25 @@ hash_netport6_data_equal(const struct hash_netport6_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_netport6_do_data_match(const struct hash_netport6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); @@ -325,7 +325,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netport6_data_next(struct hash_netport6_elem *next, const struct hash_netport6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index a82b70e8b9a6..934c1712cba8 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -56,7 +56,7 @@ struct hash_netportnet4_elem { /* Common functions */ -static inline bool +static bool hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, const struct hash_netportnet4_elem *ip2, u32 *multi) @@ -67,32 +67,32 @@ hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_netportnet4_do_data_match(const struct hash_netportnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netportnet4_data_set_flags(struct hash_netportnet4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem, struct hash_netportnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem, u8 cidr, bool inner) { @@ -126,7 +126,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netportnet4_data_next(struct hash_netportnet4_elem *next, const struct hash_netportnet4_elem *d) { @@ -331,7 +331,7 @@ struct hash_netportnet6_elem { /* Common functions */ -static inline bool +static bool hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, const struct hash_netportnet6_elem *ip2, u32 *multi) @@ -343,32 +343,32 @@ hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_netportnet6_do_data_match(const struct hash_netportnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netportnet6_data_set_flags(struct hash_netportnet6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem, struct hash_netportnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem, u8 cidr, bool inner) { @@ -402,7 +402,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netportnet6_data_next(struct hash_netportnet6_elem *next, const struct hash_netportnet6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 6f9ead6319e0..cd747c0962fd 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -149,7 +149,7 @@ __list_set_del_rcu(struct rcu_head * rcu) kfree(e); } -static inline void +static void list_set_del(struct ip_set *set, struct set_elem *e) { struct list_set *map = set->data; @@ -160,7 +160,7 @@ list_set_del(struct ip_set *set, struct set_elem *e) call_rcu(&e->rcu, __list_set_del_rcu); } -static inline void +static void list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) { struct list_set *map = set->data; @@ -288,7 +288,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (n && !(SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(n, set)))) - n = NULL; + n = NULL; e = kzalloc(set->dsize, GFP_ATOMIC); if (!e) diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index f6f1a0d5c47d..5b672e05d758 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -135,7 +135,7 @@ config IP_VS_WRR module, choose M here. If unsure, say N. config IP_VS_LC - tristate "least-connection scheduling" + tristate "least-connection scheduling" ---help--- The least-connection scheduling algorithm directs network connections to the server with the least number of active @@ -145,7 +145,7 @@ config IP_VS_LC module, choose M here. If unsure, say N. config IP_VS_WLC - tristate "weighted least-connection scheduling" + tristate "weighted least-connection scheduling" ---help--- The weighted least-connection scheduling algorithm directs network connections to the server with the least active connections @@ -333,7 +333,7 @@ config IP_VS_NFCT config IP_VS_PE_SIP tristate "SIP persistence engine" - depends on IP_VS_PROTO_UDP + depends on IP_VS_PROTO_UDP depends on NF_CONNTRACK_SIP ---help--- Allow persistence based on the SIP Call-ID diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 4515056ef1c2..f9b16f2b2219 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -193,21 +193,29 @@ struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app * mutex_lock(&__ip_vs_app_mutex); + /* increase the module use count */ + if (!ip_vs_use_count_inc()) { + err = -ENOENT; + goto out_unlock; + } + list_for_each_entry(a, &ipvs->app_list, a_list) { if (!strcmp(app->name, a->name)) { err = -EEXIST; + /* decrease the module use count */ + ip_vs_use_count_dec(); goto out_unlock; } } a = kmemdup(app, sizeof(*app), GFP_KERNEL); if (!a) { err = -ENOMEM; + /* decrease the module use count */ + ip_vs_use_count_dec(); goto out_unlock; } INIT_LIST_HEAD(&a->incs_list); list_add(&a->a_list, &ipvs->app_list); - /* increase the module use count */ - ip_vs_use_count_inc(); out_unlock: mutex_unlock(&__ip_vs_app_mutex); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 46f06f92ab8f..512259f579d7 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -617,7 +617,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; - union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; + union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; /* create a new connection entry */ IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); @@ -2402,18 +2402,22 @@ estimator_fail: return -ENOMEM; } -static void __net_exit __ip_vs_cleanup(struct net *net) +static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list) { - struct netns_ipvs *ipvs = net_ipvs(net); - - ip_vs_service_net_cleanup(ipvs); /* ip_vs_flush() with locks */ - ip_vs_conn_net_cleanup(ipvs); - ip_vs_app_net_cleanup(ipvs); - ip_vs_protocol_net_cleanup(ipvs); - ip_vs_control_net_cleanup(ipvs); - ip_vs_estimator_net_cleanup(ipvs); - IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); - net->ipvs = NULL; + struct netns_ipvs *ipvs; + struct net *net; + + ip_vs_service_nets_cleanup(net_list); /* ip_vs_flush() with locks */ + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + ip_vs_conn_net_cleanup(ipvs); + ip_vs_app_net_cleanup(ipvs); + ip_vs_protocol_net_cleanup(ipvs); + ip_vs_control_net_cleanup(ipvs); + ip_vs_estimator_net_cleanup(ipvs); + IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); + net->ipvs = NULL; + } } static int __net_init __ip_vs_dev_init(struct net *net) @@ -2429,27 +2433,32 @@ hook_fail: return ret; } -static void __net_exit __ip_vs_dev_cleanup(struct net *net) +static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) { - struct netns_ipvs *ipvs = net_ipvs(net); + struct netns_ipvs *ipvs; + struct net *net; + EnterFunction(2); - nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - ipvs->enable = 0; /* Disable packet reception */ - smp_wmb(); - ip_vs_sync_net_cleanup(ipvs); + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + ipvs->enable = 0; /* Disable packet reception */ + smp_wmb(); + ip_vs_sync_net_cleanup(ipvs); + } LeaveFunction(2); } static struct pernet_operations ipvs_core_ops = { .init = __ip_vs_init, - .exit = __ip_vs_cleanup, + .exit_batch = __ip_vs_cleanup_batch, .id = &ip_vs_net_id, .size = sizeof(struct netns_ipvs), }; static struct pernet_operations ipvs_core_dev_ops = { .init = __ip_vs_dev_init, - .exit = __ip_vs_dev_cleanup, + .exit_batch = __ip_vs_dev_cleanup_batch, }; /* diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 060565e7d227..8d14a1acbc37 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -93,7 +93,6 @@ static bool __ip_vs_addr_is_local_v6(struct net *net, static void update_defense_level(struct netns_ipvs *ipvs) { struct sysinfo i; - static int old_secure_tcp = 0; int availmem; int nomem; int to_change = -1; @@ -174,35 +173,35 @@ static void update_defense_level(struct netns_ipvs *ipvs) spin_lock(&ipvs->securetcp_lock); switch (ipvs->sysctl_secure_tcp) { case 0: - if (old_secure_tcp >= 2) + if (ipvs->old_secure_tcp >= 2) to_change = 0; break; case 1: if (nomem) { - if (old_secure_tcp < 2) + if (ipvs->old_secure_tcp < 2) to_change = 1; ipvs->sysctl_secure_tcp = 2; } else { - if (old_secure_tcp >= 2) + if (ipvs->old_secure_tcp >= 2) to_change = 0; } break; case 2: if (nomem) { - if (old_secure_tcp < 2) + if (ipvs->old_secure_tcp < 2) to_change = 1; } else { - if (old_secure_tcp >= 2) + if (ipvs->old_secure_tcp >= 2) to_change = 0; ipvs->sysctl_secure_tcp = 1; } break; case 3: - if (old_secure_tcp < 2) + if (ipvs->old_secure_tcp < 2) to_change = 1; break; } - old_secure_tcp = ipvs->sysctl_secure_tcp; + ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp; if (to_change >= 0) ip_vs_protocol_timeout_change(ipvs, ipvs->sysctl_secure_tcp > 1); @@ -262,7 +261,7 @@ static inline unsigned int ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { - register unsigned int porth = ntohs(port); + unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; __u32 ahash; @@ -424,7 +423,7 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol if (!svc && protocol == IPPROTO_TCP && atomic_read(&ipvs->ftpsvc_counter) && - (vport == FTPDATA || ntohs(vport) >= inet_prot_sock(ipvs->net))) { + (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) { /* * Check if ftp service entry exists, the packet * might belong to FTP data connections. @@ -493,7 +492,7 @@ static inline unsigned int ip_vs_rs_hashkey(int af, const union nf_inet_addr *addr, __be16 port) { - register unsigned int porth = ntohs(port); + unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 @@ -1275,7 +1274,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service *svc = NULL; /* increase the module use count */ - ip_vs_use_count_inc(); + if (!ip_vs_use_count_inc()) + return -ENOPROTOOPT; /* Lookup the scheduler by 'u->sched_name' */ if (strcmp(u->sched_name, "none")) { @@ -1607,14 +1607,20 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) /* * Delete service by {netns} in the service table. - * Called by __ip_vs_cleanup() + * Called by __ip_vs_batch_cleanup() */ -void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs) +void ip_vs_service_nets_cleanup(struct list_head *net_list) { + struct netns_ipvs *ipvs; + struct net *net; + EnterFunction(2); /* Check for "full" addressed entries */ mutex_lock(&__ip_vs_mutex); - ip_vs_flush(ipvs, true); + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + ip_vs_flush(ipvs, true); + } mutex_unlock(&__ip_vs_mutex); LeaveFunction(2); } @@ -1737,12 +1743,18 @@ proc_do_defense_mode(struct ctl_table *table, int write, int val = *valp; int rc; - rc = proc_dointvec(table, write, buffer, lenp, ppos); + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(int), + .mode = table->mode, + }; + + rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write && (*valp != val)) { - if ((*valp < 0) || (*valp > 3)) { - /* Restore the correct value */ - *valp = val; + if (val < 0 || val > 3) { + rc = -EINVAL; } else { + *valp = val; update_defense_level(ipvs); } } @@ -1756,33 +1768,20 @@ proc_do_sync_threshold(struct ctl_table *table, int write, int *valp = table->data; int val[2]; int rc; + struct ctl_table tmp = { + .data = &val, + .maxlen = table->maxlen, + .mode = table->mode, + }; - /* backup the value first */ memcpy(val, valp, sizeof(val)); - - rc = proc_dointvec(table, write, buffer, lenp, ppos); - if (write && (valp[0] < 0 || valp[1] < 0 || - (valp[0] >= valp[1] && valp[1]))) { - /* Restore the correct value */ - memcpy(valp, val, sizeof(val)); - } - return rc; -} - -static int -proc_do_sync_mode(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int *valp = table->data; - int val = *valp; - int rc; - - rc = proc_dointvec(table, write, buffer, lenp, ppos); - if (write && (*valp != val)) { - if ((*valp < 0) || (*valp > 1)) { - /* Restore the correct value */ - *valp = val; - } + rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); + if (write) { + if (val[0] < 0 || val[1] < 0 || + (val[0] >= val[1] && val[1])) + rc = -EINVAL; + else + memcpy(valp, val, sizeof(val)); } return rc; } @@ -1795,12 +1794,18 @@ proc_do_sync_ports(struct ctl_table *table, int write, int val = *valp; int rc; - rc = proc_dointvec(table, write, buffer, lenp, ppos); + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(int), + .mode = table->mode, + }; + + rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write && (*valp != val)) { - if (*valp < 1 || !is_power_of_2(*valp)) { - /* Restore the correct value */ + if (val < 1 || !is_power_of_2(val)) + rc = -EINVAL; + else *valp = val; - } } return rc; } @@ -1860,7 +1865,9 @@ static struct ctl_table vs_vars[] = { .procname = "sync_version", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_do_sync_mode, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "sync_ports", @@ -2434,9 +2441,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (copy_from_user(arg, user, len) != 0) return -EFAULT; - /* increase the module use count */ - ip_vs_use_count_inc(); - /* Handle daemons since they have another lock */ if (cmd == IP_VS_SO_SET_STARTDAEMON || cmd == IP_VS_SO_SET_STOPDAEMON) { @@ -2449,13 +2453,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) ret = -EINVAL; if (strscpy(cfg.mcast_ifn, dm->mcast_ifn, sizeof(cfg.mcast_ifn)) <= 0) - goto out_dec; + return ret; cfg.syncid = dm->syncid; ret = start_sync_thread(ipvs, &cfg, dm->state); } else { ret = stop_sync_thread(ipvs, dm->state); } - goto out_dec; + return ret; } mutex_lock(&__ip_vs_mutex); @@ -2550,10 +2554,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) out_unlock: mutex_unlock(&__ip_vs_mutex); - out_dec: - /* decrease the module use count */ - ip_vs_use_count_dec(); - return ret; } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index c8b5a504476c..77c323c36a88 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -160,7 +160,7 @@ static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) /* get weighted least-connection node in the destination set */ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) { - register struct ip_vs_dest_set_elem *e; + struct ip_vs_dest_set_elem *e; struct ip_vs_dest *dest, *least; int loh, doh; @@ -209,7 +209,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) /* get weighted most-connection node in the destination set */ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) { - register struct ip_vs_dest_set_elem *e; + struct ip_vs_dest_set_elem *e; struct ip_vs_dest *dest, *most; int moh, doh; diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c index 94d9d349ebb0..da0280cec506 100644 --- a/net/netfilter/ipvs/ip_vs_mh.c +++ b/net/netfilter/ipvs/ip_vs_mh.c @@ -174,8 +174,8 @@ static int ip_vs_mh_populate(struct ip_vs_mh_state *s, return 0; } - table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE), - sizeof(unsigned long), GFP_KERNEL); + table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE), + sizeof(unsigned long), GFP_KERNEL); if (!table) return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c index 78b074cd5464..c03066fdd5ca 100644 --- a/net/netfilter/ipvs/ip_vs_ovf.c +++ b/net/netfilter/ipvs/ip_vs_ovf.c @@ -5,7 +5,7 @@ * Authors: Raducu Deaconu <rhadoo_io@yahoo.com> * * Scheduler implements "overflow" loadbalancing according to number of active - * connections , will keep all conections to the node with the highest weight + * connections , will keep all connections to the node with the highest weight * and overflow to the next node if the number of connections exceeds the node's * weight. * Note that this scheduler might not be suitable for UDP because it only uses diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c index 8e104dff7abc..166c669f0763 100644 --- a/net/netfilter/ipvs/ip_vs_pe.c +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -68,7 +68,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe) struct ip_vs_pe *tmp; /* increase the module use count */ - ip_vs_use_count_inc(); + if (!ip_vs_use_count_inc()) + return -ENOENT; mutex_lock(&ip_vs_pe_mutex); /* Make sure that the pe with this name doesn't exist diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 000d961b97e4..32b028853a7c 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -710,7 +710,7 @@ static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd sizeof(tcp_timeouts)); if (!pd->timeout_table) return -ENOMEM; - pd->tcp_state_table = tcp_states; + pd->tcp_state_table = tcp_states; return 0; } diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index 2f9d5cd5daee..d4903723be7e 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -179,7 +179,8 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) } /* increase the module use count */ - ip_vs_use_count_inc(); + if (!ip_vs_use_count_inc()) + return -ENOENT; mutex_lock(&ip_vs_sched_mutex); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index a4a78c4b06de..605e0f68f8bd 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1239,7 +1239,7 @@ static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer, p = msg_end; if (p + sizeof(s->v4) > buffer+buflen) { - IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n"); + IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n"); return; } s = (union ip_vs_sync_conn *)p; @@ -1762,6 +1762,10 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n", sizeof(struct ip_vs_sync_conn_v0)); + /* increase the module use count */ + if (!ip_vs_use_count_inc()) + return -ENOPROTOOPT; + /* Do not hold one mutex and then to block on another */ for (;;) { rtnl_lock(); @@ -1892,9 +1896,6 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); - /* increase the module use count */ - ip_vs_use_count_inc(); - return 0; out: @@ -1924,11 +1925,17 @@ out: } kfree(ti); } + + /* decrease the module use count */ + ip_vs_use_count_dec(); return result; out_early: mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); + + /* decrease the module use count */ + ip_vs_use_count_dec(); return result; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 9c464d24beec..b00866d777fe 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -208,7 +208,7 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) struct rtable *ort = skb_rtable(skb); if (!skb->dev && sk && sk_fullsock(sk)) - ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true); } static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af, @@ -407,12 +407,9 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, goto err_put; skb_dst_drop(skb); - if (noref) { - if (!local) - skb_dst_set_noref(skb, &rt->dst); - else - skb_dst_set(skb, dst_clone(&rt->dst)); - } else + if (noref) + skb_dst_set_noref(skb, &rt->dst); + else skb_dst_set(skb, &rt->dst); return local; @@ -574,12 +571,9 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, goto err_put; skb_dst_drop(skb); - if (noref) { - if (!local) - skb_dst_set_noref(skb, &rt->dst); - else - skb_dst_set(skb, dst_clone(&rt->dst)); - } else + if (noref) + skb_dst_set_noref(skb, &rt->dst); + else skb_dst_set(skb, &rt->dst); return local; @@ -613,7 +607,7 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) ret = ip_vs_confirm_conntrack(skb); if (ret == NF_ACCEPT) { - nf_reset(skb); + nf_reset_ct(skb); skb_forward_csum(skb); } return ret; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 81a8ef42b88d..d1305423640f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -73,8 +73,7 @@ struct conntrack_gc_work { }; static __read_mostly struct kmem_cache *nf_conntrack_cachep; -static __read_mostly spinlock_t nf_conntrack_locks_all_lock; -static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); +static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; /* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */ @@ -574,7 +573,6 @@ EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); void nf_ct_tmpl_free(struct nf_conn *tmpl) { nf_ct_ext_destroy(tmpl); - nf_ct_ext_free(tmpl); if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) kfree((char *)tmpl - tmpl->proto.tmpl_padto); @@ -897,9 +895,10 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, } /* Resolve race on insertion if this protocol allows this. */ -static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - struct nf_conntrack_tuple_hash *h) +static __cold noinline int +nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + struct nf_conntrack_tuple_hash *h) { /* This is the conntrack entry already in hashes that won race. */ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); @@ -1418,7 +1417,6 @@ void nf_conntrack_free(struct nf_conn *ct) WARN_ON(atomic_read(&ct->ct_general.use) != 0); nf_ct_ext_destroy(ct); - nf_ct_ext_free(ct); kmem_cache_free(nf_conntrack_cachep, ct); smp_mb__before_atomic(); atomic_dec(&net->ct.count); @@ -1793,8 +1791,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, if (nf_ct_is_confirmed(ct)) extra_jiffies += nfct_time_stamp; - if (ct->timeout != extra_jiffies) - ct->timeout = extra_jiffies; + if (READ_ONCE(ct->timeout) != extra_jiffies) + WRITE_ONCE(ct->timeout, extra_jiffies); acct: if (do_acct) nf_ct_acct_update(ct, ctinfo, skb->len); @@ -2250,8 +2248,7 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); - hash = kvmalloc_array(nr_slots, sizeof(struct hlist_nulls_head), - GFP_KERNEL | __GFP_ZERO); + hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); if (hash && nulls) for (i = 0; i < nr_slots; i++) @@ -2336,7 +2333,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) return nf_conntrack_hash_resize(hashsize); } -EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); static __always_inline unsigned int total_extension_size(void) { diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 5e2812ee2149..7956c9f19899 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -24,11 +24,13 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> static DEFINE_MUTEX(nf_ct_ecache_mutex); #define ECACHE_RETRY_WAIT (HZ/10) +#define ECACHE_STACK_ALLOC (256 / sizeof(void *)) enum retry_state { STATE_CONGESTED, @@ -38,11 +40,11 @@ enum retry_state { static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) { - struct nf_conn *refs[16]; + struct nf_conn *refs[ECACHE_STACK_ALLOC]; + enum retry_state ret = STATE_DONE; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; unsigned int evicted = 0; - enum retry_state ret = STATE_DONE; spin_lock(&pcpu->lock); @@ -53,10 +55,22 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) if (!nf_ct_is_confirmed(ct)) continue; + /* This ecache access is safe because the ct is on the + * pcpu dying list and we hold the spinlock -- the entry + * cannot be free'd until after the lock is released. + * + * This is true even if ct has a refcount of 0: the + * cpu that is about to free the entry must remove it + * from the dying list and needs the lock to do so. + */ e = nf_ct_ecache_find(ct); if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL) continue; + /* ct is in NFCT_ECACHE_DESTROY_FAIL state, this means + * the worker owns this entry: the ct will remain valid + * until the worker puts its ct reference. + */ if (nf_conntrack_event(IPCT_DESTROY, ct)) { ret = STATE_CONGESTED; break; @@ -188,15 +202,15 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct) if (notify == NULL) goto out_unlock; + if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) + goto out_unlock; + e = nf_ct_ecache_find(ct); if (e == NULL) goto out_unlock; events = xchg(&e->cache, 0); - if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) - goto out_unlock; - /* We make a copy of the missed event cache without taking * the lock, thus we may send missed events twice. However, * this does not harm and it happens very rarely. */ diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 65364de915d1..42557d2b6a90 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -25,8 +25,10 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_zones.h> diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index d4ed1e197921..3dbe2329c3f1 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -34,21 +34,23 @@ void nf_ct_ext_destroy(struct nf_conn *ct) t->destroy(ct); rcu_read_unlock(); } + + kfree(ct->ext); } -EXPORT_SYMBOL(nf_ct_ext_destroy); void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) { unsigned int newlen, newoff, oldlen, alloc; - struct nf_ct_ext *old, *new; struct nf_ct_ext_type *t; + struct nf_ct_ext *new; /* Conntrack must not be confirmed to avoid races on reallocation. */ WARN_ON(nf_ct_is_confirmed(ct)); - old = ct->ext; - if (old) { + if (ct->ext) { + const struct nf_ct_ext *old = ct->ext; + if (__nf_ct_ext_exist(old, id)) return NULL; oldlen = old->len; @@ -68,22 +70,18 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) rcu_read_unlock(); alloc = max(newlen, NF_CT_EXT_PREALLOC); - kmemleak_not_leak(old); - new = __krealloc(old, alloc, gfp); + new = krealloc(ct->ext, alloc, gfp); if (!new) return NULL; - if (!old) { + if (!ct->ext) memset(new->offset, 0, sizeof(new->offset)); - ct->ext = new; - } else if (new != old) { - kfree_rcu(old, rcu); - rcu_assign_pointer(ct->ext, new); - } new->offset[id] = newoff; new->len = newlen; memset((void *)new + newoff, 0, newlen - newoff); + + ct->ext = new; return (void *)new + newoff; } EXPORT_SYMBOL(nf_ct_ext_add); diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 0ecb3e289ef2..9eca90414bb7 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -162,7 +162,7 @@ static int try_rfc959(const char *data, size_t dlen, if (length == 0) return 0; - cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) | + cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3]); cmd->u.tcp.port = htons((array[4] << 8) | array[5]); return length; @@ -322,7 +322,7 @@ static int find_pattern(const char *data, size_t dlen, i++; } - pr_debug("Skipped up to `%c'!\n", skip); + pr_debug("Skipped up to 0x%hhx delimiter!\n", skip); *numoff = i; *numlen = getnum(data + i, dlen - i, cmd, term, numoff); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 8d729e7c36ff..118f415928ae 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -21,10 +21,11 @@ #include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_l4proto.h> -#include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_log.h> static DEFINE_MUTEX(nf_ct_helper_mutex); diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c index 74b8113f7aeb..522792556632 100644 --- a/net/netfilter/nf_conntrack_labels.c +++ b/net/netfilter/nf_conntrack_labels.c @@ -11,7 +11,7 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_labels.h> -static spinlock_t nf_connlabels_lock; +static DEFINE_SPINLOCK(nf_connlabels_lock); static int replace_u32(u32 *address, u32 mask, u32 new) { @@ -89,7 +89,6 @@ int nf_conntrack_labels_init(void) { BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX); - spin_lock_init(&nf_connlabels_lock); return nf_ct_extend_register(&labels_extend); } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 6aa01eb6fe99..6a1c8f1f6171 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -506,9 +506,45 @@ nla_put_failure: return -1; } +/* all these functions access ct->ext. Caller must either hold a reference + * on ct or prevent its deletion by holding either the bucket spinlock or + * pcpu dying list lock. + */ +static int ctnetlink_dump_extinfo(struct sk_buff *skb, + struct nf_conn *ct, u32 type) +{ + if (ctnetlink_dump_acct(skb, ct, type) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0 || + ctnetlink_dump_helpinfo(skb, ct) < 0 || + ctnetlink_dump_labels(skb, ct) < 0 || + ctnetlink_dump_ct_seq_adj(skb, ct) < 0 || + ctnetlink_dump_ct_synproxy(skb, ct) < 0) + return -1; + + return 0; +} + +static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) +{ + if (ctnetlink_dump_status(skb, ct) < 0 || + ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_secctx(skb, ct) < 0 || + ctnetlink_dump_id(skb, ct) < 0 || + ctnetlink_dump_use(skb, ct) < 0 || + ctnetlink_dump_master(skb, ct) < 0) + return -1; + + if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) && + (ctnetlink_dump_timeout(skb, ct) < 0 || + ctnetlink_dump_protoinfo(skb, ct) < 0)) + return -1; + + return 0; +} + static int ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, - struct nf_conn *ct) + struct nf_conn *ct, bool extinfo) { const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; @@ -552,20 +588,9 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; - if (ctnetlink_dump_status(skb, ct) < 0 || - ctnetlink_dump_timeout(skb, ct) < 0 || - ctnetlink_dump_acct(skb, ct, type) < 0 || - ctnetlink_dump_timestamp(skb, ct) < 0 || - ctnetlink_dump_protoinfo(skb, ct) < 0 || - ctnetlink_dump_helpinfo(skb, ct) < 0 || - ctnetlink_dump_mark(skb, ct) < 0 || - ctnetlink_dump_secctx(skb, ct) < 0 || - ctnetlink_dump_labels(skb, ct) < 0 || - ctnetlink_dump_id(skb, ct) < 0 || - ctnetlink_dump_use(skb, ct) < 0 || - ctnetlink_dump_master(skb, ct) < 0 || - ctnetlink_dump_ct_seq_adj(skb, ct) < 0 || - ctnetlink_dump_ct_synproxy(skb, ct) < 0) + if (ctnetlink_dump_info(skb, ct) < 0) + goto nla_put_failure; + if (extinfo && ctnetlink_dump_extinfo(skb, ct, type) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -950,13 +975,11 @@ restart: if (!ctnetlink_filter_match(ct, cb->data)) continue; - rcu_read_lock(); res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct); - rcu_read_unlock(); + ct, true); if (res < 0) { nf_conntrack_get(&ct->ct_general); cb->args[1] = (unsigned long)ct; @@ -1361,10 +1384,8 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, return -ENOMEM; } - rcu_read_lock(); err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), ct); - rcu_read_unlock(); + NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true); nf_ct_put(ct); if (err <= 0) goto free; @@ -1426,12 +1447,18 @@ restart: continue; cb->args[1] = 0; } - rcu_read_lock(); + + /* We can't dump extension info for the unconfirmed + * list because unconfirmed conntracks can have + * ct->ext reallocated (and thus freed). + * + * In the dying list case ct->ext can't be free'd + * until after we drop pcpu->lock. + */ res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct); - rcu_read_unlock(); + ct, dying ? true : false); if (res < 0) { if (!atomic_inc_not_zero(&ct->ct_general.use)) continue; @@ -3599,6 +3626,9 @@ static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list) list_for_each_entry(net, net_exit_list, exit_list) ctnetlink_net_exit(net); + + /* wait for other cpus until they are done with ctnl_notifiers */ + synchronize_rcu(); } static struct pernet_operations ctnetlink_net_ops = { diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index b6b14db3955b..b3f4a334f9d7 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -677,6 +677,9 @@ static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], unsigned int *timeouts = data; int i; + if (!timeouts) + timeouts = dn->dccp_timeout; + /* set default DCCP timeouts. */ for (i=0; i<CT_DCCP_MAX; i++) timeouts[i] = dn->dccp_timeout[i]; diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 097deba7441a..c2e3dff773bc 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -235,11 +235,7 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, } /* Need to track icmp error message? */ - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_SOURCE_QUENCH && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB && - icmph->type != ICMP_REDIRECT) + if (!icmp_is_err(icmph->type)) return NF_ACCEPT; memset(&outer_daddr, 0, sizeof(outer_daddr)); diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 7e317e6698ba..6f9144e1f1c1 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -22,7 +22,6 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_zones.h> -#include <net/netfilter/ipv6/nf_conntrack_icmpv6.h> #include <net/netfilter/nf_log.h> static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index fce3d93f1541..4f897b14b606 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -114,7 +114,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ -/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, +/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, /* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA}, /* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, /* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS}, @@ -130,7 +130,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { /* REPLY */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ /* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, +/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, /* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL}, /* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR}, /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA}, @@ -316,7 +316,7 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; } - ct->proto.sctp.state = new_state; + ct->proto.sctp.state = SCTP_CONNTRACK_NONE; } return true; @@ -594,6 +594,9 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; + if (!timeouts) + timeouts = sn->timeouts; + /* set default SCTP timeouts. */ for (i=0; i<SCTP_CONNTRACK_MAX; i++) timeouts[i] = sn->timeouts[i]; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 85c1f8c213b0..1926fd56df56 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1227,7 +1227,7 @@ static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = { [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 }, [CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NLA_U8 }, [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .len = sizeof(struct nf_ct_tcp_flags) }, - [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) }, + [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) }, }; #define TCP_NLATTR_SIZE ( \ diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index e0d392cb3075..410809c669e1 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -511,8 +511,6 @@ static void nf_conntrack_standalone_fini_proc(struct net *net) /* Log invalid packets of a given protocol */ static int log_invalid_proto_min __read_mostly; static int log_invalid_proto_max __read_mostly = 255; -static int zero; -static int one = 1; /* size the user *wants to set */ static unsigned int nf_conntrack_htable_size_user __read_mostly; @@ -629,8 +627,8 @@ static struct ctl_table nf_ct_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_LOG_INVALID] = { .procname = "nf_conntrack_log_invalid", @@ -654,8 +652,8 @@ static struct ctl_table nf_ct_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_HELPER] = { .procname = "nf_conntrack_helper", @@ -663,8 +661,8 @@ static struct ctl_table nf_ct_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #ifdef CONFIG_NF_CONNTRACK_EVENTS [NF_SYSCTL_CT_EVENTS] = { @@ -673,8 +671,8 @@ static struct ctl_table nf_ct_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP @@ -684,8 +682,8 @@ static struct ctl_table nf_ct_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif [NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC] = { @@ -759,16 +757,16 @@ static struct ctl_table nf_ct_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_LIBERAL] = { .procname = "nf_conntrack_tcp_be_liberal", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = { .procname = "nf_conntrack_tcp_max_retrans", @@ -904,8 +902,8 @@ static struct ctl_table nf_ct_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_NF_CT_PROTO_GRE @@ -1037,9 +1035,14 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[NF_SYSCTL_CT_COUNT].data = &net->ct.count; table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum; table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid; + table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct; + table[NF_SYSCTL_CT_HELPER].data = &net->ct.sysctl_auto_assign_helper; #ifdef CONFIG_NF_CONNTRACK_EVENTS table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events; #endif +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + table[NF_SYSCTL_CT_TIMESTAMP].data = &net->ct.sysctl_tstamp; +#endif table[NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC].data = &nf_generic_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP].data = &nf_icmp_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6].data = &nf_icmpv6_pernet(net)->timeout; @@ -1164,7 +1167,6 @@ static int __init nf_conntrack_standalone_init(void) if (ret < 0) goto out_start; - BUILD_BUG_ON(SKB_NFCT_PTRMASK != NFCT_PTRMASK); BUILD_BUG_ON(NFCT_INFOMASK <= IP_CT_NUMBER); #ifdef CONFIG_SYSCTL diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c index 13d0f4a92647..14387e0b8008 100644 --- a/net/netfilter/nf_conntrack_timeout.c +++ b/net/netfilter/nf_conntrack_timeout.c @@ -19,6 +19,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_timeout.h> struct nf_ct_timeout * diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index 5a35ef08c3cb..f108a76925dd 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -10,6 +10,7 @@ #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_offload.h> #include <net/netfilter/nf_dup_netdev.h> static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev) @@ -50,5 +51,25 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif) } EXPORT_SYMBOL_GPL(nf_dup_netdev_egress); +int nft_fwd_dup_netdev_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + enum flow_action_id id, int oif) +{ + struct flow_action_entry *entry; + struct net_device *dev; + + /* nft_flow_rule_destroy() releases the reference on this device. */ + dev = dev_get_by_index(ctx->net, oif); + if (!dev) + return -EOPNOTSUPP; + + entry = &flow->rule->action.entries[ctx->num_actions++]; + entry->id = id; + entry->dev = dev; + + return 0; +} +EXPORT_SYMBOL_GPL(nft_fwd_dup_netdev_offload); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 80a8f9ae4c93..8af28e10b4e6 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -11,26 +11,18 @@ #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> -struct flow_offload_entry { - struct flow_offload flow; - struct nf_conn *ct; - struct rcu_head rcu_head; -}; - static DEFINE_MUTEX(flowtable_lock); static LIST_HEAD(flowtables); static void -flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, - struct nf_flow_route *route, +flow_offload_fill_dir(struct flow_offload *flow, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple; - struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple; - struct dst_entry *other_dst = route->tuple[!dir].dst; - struct dst_entry *dst = route->tuple[dir].dst; + struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple; ft->dir = dir; @@ -38,12 +30,10 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, case NFPROTO_IPV4: ft->src_v4 = ctt->src.u3.in; ft->dst_v4 = ctt->dst.u3.in; - ft->mtu = ip_dst_mtu_maybe_forward(dst, true); break; case NFPROTO_IPV6: ft->src_v6 = ctt->src.u3.in6; ft->dst_v6 = ctt->dst.u3.in6; - ft->mtu = ip6_dst_mtu_forward(dst); break; } @@ -51,49 +41,32 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, ft->l4proto = ctt->dst.protonum; ft->src_port = ctt->src.u.tcp.port; ft->dst_port = ctt->dst.u.tcp.port; - - ft->iifidx = other_dst->dev->ifindex; - ft->dst_cache = dst; } -struct flow_offload * -flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route) +struct flow_offload *flow_offload_alloc(struct nf_conn *ct) { - struct flow_offload_entry *entry; struct flow_offload *flow; if (unlikely(nf_ct_is_dying(ct) || !atomic_inc_not_zero(&ct->ct_general.use))) return NULL; - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); - if (!entry) + flow = kzalloc(sizeof(*flow), GFP_ATOMIC); + if (!flow) goto err_ct_refcnt; - flow = &entry->flow; + flow->ct = ct; - if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst)) - goto err_dst_cache_original; - - if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst)) - goto err_dst_cache_reply; - - entry->ct = ct; - - flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL); - flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY); + flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL); + flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY); if (ct->status & IPS_SRC_NAT) - flow->flags |= FLOW_OFFLOAD_SNAT; + __set_bit(NF_FLOW_SNAT, &flow->flags); if (ct->status & IPS_DST_NAT) - flow->flags |= FLOW_OFFLOAD_DNAT; + __set_bit(NF_FLOW_DNAT, &flow->flags); return flow; -err_dst_cache_reply: - dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst); -err_dst_cache_original: - kfree(entry); err_ct_refcnt: nf_ct_put(ct); @@ -101,6 +74,56 @@ err_ct_refcnt: } EXPORT_SYMBOL_GPL(flow_offload_alloc); +static int flow_offload_fill_route(struct flow_offload *flow, + const struct nf_flow_route *route, + enum flow_offload_tuple_dir dir) +{ + struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; + struct dst_entry *other_dst = route->tuple[!dir].dst; + struct dst_entry *dst = route->tuple[dir].dst; + + if (!dst_hold_safe(route->tuple[dir].dst)) + return -1; + + switch (flow_tuple->l3proto) { + case NFPROTO_IPV4: + flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true); + break; + case NFPROTO_IPV6: + flow_tuple->mtu = ip6_dst_mtu_forward(dst); + break; + } + + flow_tuple->iifidx = other_dst->dev->ifindex; + flow_tuple->dst_cache = dst; + + return 0; +} + +int flow_offload_route_init(struct flow_offload *flow, + const struct nf_flow_route *route) +{ + int err; + + err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); + if (err < 0) + return err; + + err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); + if (err < 0) + goto err_route_reply; + + flow->type = NF_FLOW_OFFLOAD_ROUTE; + + return 0; + +err_route_reply: + dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst); + + return err; +} +EXPORT_SYMBOL_GPL(flow_offload_route_init); + static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) { tcp->state = TCP_CONNTRACK_ESTABLISHED; @@ -111,11 +134,6 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) #define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ) #define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ) -static inline __s32 nf_flow_timeout_delta(unsigned int timeout) -{ - return (__s32)(timeout - (u32)jiffies); -} - static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; @@ -149,17 +167,23 @@ static void flow_offload_fixup_ct(struct nf_conn *ct) flow_offload_fixup_ct_timeout(ct); } -void flow_offload_free(struct flow_offload *flow) +static void flow_offload_route_release(struct flow_offload *flow) { - struct flow_offload_entry *e; - dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache); dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache); - e = container_of(flow, struct flow_offload_entry, flow); - if (flow->flags & FLOW_OFFLOAD_DYING) - nf_ct_delete(e->ct, 0, 0); - nf_ct_put(e->ct); - kfree_rcu(e, rcu_head); +} + +void flow_offload_free(struct flow_offload *flow) +{ + switch (flow->type) { + case NF_FLOW_OFFLOAD_ROUTE: + flow_offload_route_release(flow); + break; + default: + break; + } + nf_ct_put(flow->ct); + kfree_rcu(flow, rcu_head); } EXPORT_SYMBOL_GPL(flow_offload_free); @@ -201,6 +225,8 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { int err; + flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; + err = rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[0].node, nf_flow_offload_rhash_params); @@ -217,7 +243,11 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) return err; } - flow->timeout = (u32)jiffies; + if (nf_flowtable_hw_offload(flow_table)) { + __set_bit(NF_FLOW_HW, &flow->flags); + nf_flow_offload_add(flow_table, flow); + } + return 0; } EXPORT_SYMBOL_GPL(flow_offload_add); @@ -230,8 +260,6 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow) static void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow) { - struct flow_offload_entry *e; - rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, nf_flow_offload_rhash_params); @@ -239,25 +267,21 @@ static void flow_offload_del(struct nf_flowtable *flow_table, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, nf_flow_offload_rhash_params); - e = container_of(flow, struct flow_offload_entry, flow); - clear_bit(IPS_OFFLOAD_BIT, &e->ct->status); + clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); if (nf_flow_has_expired(flow)) - flow_offload_fixup_ct(e->ct); - else if (flow->flags & FLOW_OFFLOAD_TEARDOWN) - flow_offload_fixup_ct_timeout(e->ct); + flow_offload_fixup_ct(flow->ct); + else if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) + flow_offload_fixup_ct_timeout(flow->ct); flow_offload_free(flow); } void flow_offload_teardown(struct flow_offload *flow) { - struct flow_offload_entry *e; - - flow->flags |= FLOW_OFFLOAD_TEARDOWN; + set_bit(NF_FLOW_TEARDOWN, &flow->flags); - e = container_of(flow, struct flow_offload_entry, flow); - flow_offload_fixup_ct_state(e->ct); + flow_offload_fixup_ct_state(flow->ct); } EXPORT_SYMBOL_GPL(flow_offload_teardown); @@ -267,7 +291,6 @@ flow_offload_lookup(struct nf_flowtable *flow_table, { struct flow_offload_tuple_rhash *tuplehash; struct flow_offload *flow; - struct flow_offload_entry *e; int dir; tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple, @@ -277,11 +300,10 @@ flow_offload_lookup(struct nf_flowtable *flow_table, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN)) + if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) return NULL; - e = container_of(flow, struct flow_offload_entry, flow); - if (unlikely(nf_ct_is_dying(e->ct))) + if (unlikely(nf_ct_is_dying(flow->ct))) return NULL; return tuplehash; @@ -325,12 +347,20 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table, static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) { struct nf_flowtable *flow_table = data; - struct flow_offload_entry *e; - e = container_of(flow, struct flow_offload_entry, flow); - if (nf_flow_has_expired(flow) || nf_ct_is_dying(e->ct) || - (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))) - flow_offload_del(flow_table, flow); + if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || + test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { + if (test_bit(NF_FLOW_HW, &flow->flags)) { + if (!test_bit(NF_FLOW_HW_DYING, &flow->flags)) + nf_flow_offload_del(flow_table, flow); + else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags)) + flow_offload_del(flow_table, flow); + } else { + flow_offload_del(flow_table, flow); + } + } else if (test_bit(NF_FLOW_HW, &flow->flags)) { + nf_flow_offload_stats(flow_table, flow); + } } static void nf_flow_offload_work_gc(struct work_struct *work) @@ -463,6 +493,7 @@ int nf_flow_table_init(struct nf_flowtable *flowtable) int err; INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); + flow_block_init(&flowtable->flow_block); err = rhashtable_init(&flowtable->rhashtable, &nf_flow_offload_rhash_params); @@ -483,18 +514,16 @@ EXPORT_SYMBOL_GPL(nf_flow_table_init); static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) { struct net_device *dev = data; - struct flow_offload_entry *e; - - e = container_of(flow, struct flow_offload_entry, flow); if (!dev) { flow_offload_teardown(flow); return; } - if (net_eq(nf_ct_net(e->ct), dev_net(dev)) && + + if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) && (flow->tuplehash[0].tuple.iifidx == dev->ifindex || flow->tuplehash[1].tuple.iifidx == dev->ifindex)) - flow_offload_dead(flow); + flow_offload_teardown(flow); } static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, @@ -502,6 +531,7 @@ static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, { nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); flush_delayed_work(&flowtable->gc_work); + nf_flow_table_offload_flush(flowtable); } void nf_flow_table_cleanup(struct net_device *dev) @@ -523,9 +553,23 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) cancel_delayed_work_sync(&flow_table->gc_work); nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table); + nf_flow_table_offload_flush(flow_table); rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); +static int __init nf_flow_table_module_init(void) +{ + return nf_flow_table_offload_init(); +} + +static void __exit nf_flow_table_module_exit(void) +{ + nf_flow_table_offload_exit(); +} + +module_init(nf_flow_table_module_init); +module_exit(nf_flow_table_module_exit); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 593357aedb36..88bedf1ff1ae 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -21,9 +21,34 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, return NF_ACCEPT; } +static int nf_flow_rule_route_inet(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + const struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; + int err; + + switch (flow_tuple->l3proto) { + case NFPROTO_IPV4: + err = nf_flow_rule_route_ipv4(net, flow, dir, flow_rule); + break; + case NFPROTO_IPV6: + err = nf_flow_rule_route_ipv6(net, flow, dir, flow_rule); + break; + default: + err = -1; + break; + } + + return err; +} + static struct nf_flowtable_type flowtable_inet = { .family = NFPROTO_INET, .init = nf_flow_table_init, + .setup = nf_flow_table_offload_setup, + .action = nf_flow_rule_route_inet, .free = nf_flow_table_free, .hook = nf_flow_offload_inet_hook, .owner = THIS_MODULE, diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index d68c801dd614..9e563fd3da0f 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -144,11 +144,11 @@ static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, { struct iphdr *iph = ip_hdr(skb); - if (flow->flags & FLOW_OFFLOAD_SNAT && + if (test_bit(NF_FLOW_SNAT, &flow->flags) && (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 || nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0)) return -1; - if (flow->flags & FLOW_OFFLOAD_DNAT && + if (test_bit(NF_FLOW_DNAT, &flow->flags) && (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 || nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0)) return -1; @@ -228,11 +228,17 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, { skb_orphan(skb); skb_dst_set_noref(skb, dst); - skb->tstamp = 0; dst_output(state->net, state->sk, skb); return NF_STOLEN; } +static bool nf_flow_offload_refresh(struct nf_flowtable *flow_table, + struct flow_offload *flow) +{ + return nf_flowtable_hw_offload(flow_table) && + test_and_clear_bit(NF_FLOW_HW_REFRESH, &flow->flags); +} + unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -273,6 +279,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff)) return NF_ACCEPT; + if (unlikely(nf_flow_offload_refresh(flow_table, flow))) + nf_flow_offload_add(flow_table, flow); + if (nf_flow_offload_dst_check(&rt->dst)) { flow_offload_teardown(flow); return NF_ACCEPT; @@ -281,9 +290,10 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0) return NF_DROP; - flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; iph = ip_hdr(skb); ip_decrease_ttl(iph); + skb->tstamp = 0; if (unlikely(dst_xfrm(&rt->dst))) { memset(skb->cb, 0, sizeof(struct inet_skb_parm)); @@ -414,11 +424,11 @@ static int nf_flow_nat_ipv6(const struct flow_offload *flow, struct ipv6hdr *ip6h = ipv6_hdr(skb); unsigned int thoff = sizeof(*ip6h); - if (flow->flags & FLOW_OFFLOAD_SNAT && + if (test_bit(NF_FLOW_SNAT, &flow->flags) && (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) return -1; - if (flow->flags & FLOW_OFFLOAD_DNAT && + if (test_bit(NF_FLOW_DNAT, &flow->flags) && (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) return -1; @@ -498,6 +508,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, sizeof(*ip6h))) return NF_ACCEPT; + if (unlikely(nf_flow_offload_refresh(flow_table, flow))) + nf_flow_offload_add(flow_table, flow); + if (nf_flow_offload_dst_check(&rt->dst)) { flow_offload_teardown(flow); return NF_ACCEPT; @@ -509,9 +522,10 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (nf_flow_nat_ipv6(flow, skb, dir) < 0) return NF_DROP; - flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; ip6h = ipv6_hdr(skb); ip6h->hop_limit--; + skb->tstamp = 0; if (unlikely(dst_xfrm(&rt->dst))) { memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c new file mode 100644 index 000000000000..83e1db37c3b0 --- /dev/null +++ b/net/netfilter/nf_flow_table_offload.c @@ -0,0 +1,905 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/netdevice.h> +#include <linux/tc_act/tc_csum.h> +#include <net/flow_offload.h> +#include <net/netfilter/nf_flow_table.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_tuple.h> + +static struct work_struct nf_flow_offload_work; +static DEFINE_SPINLOCK(flow_offload_pending_list_lock); +static LIST_HEAD(flow_offload_pending_list); + +struct flow_offload_work { + struct list_head list; + enum flow_cls_command cmd; + int priority; + struct nf_flowtable *flowtable; + struct flow_offload *flow; +}; + +struct nf_flow_key { + struct flow_dissector_key_meta meta; + struct flow_dissector_key_control control; + struct flow_dissector_key_basic basic; + union { + struct flow_dissector_key_ipv4_addrs ipv4; + struct flow_dissector_key_ipv6_addrs ipv6; + }; + struct flow_dissector_key_tcp tcp; + struct flow_dissector_key_ports tp; +} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ + +struct nf_flow_match { + struct flow_dissector dissector; + struct nf_flow_key key; + struct nf_flow_key mask; +}; + +struct nf_flow_rule { + struct nf_flow_match match; + struct flow_rule *rule; +}; + +#define NF_FLOW_DISSECTOR(__match, __type, __field) \ + (__match)->dissector.offset[__type] = \ + offsetof(struct nf_flow_key, __field) + +static int nf_flow_rule_match(struct nf_flow_match *match, + const struct flow_offload_tuple *tuple) +{ + struct nf_flow_key *mask = &match->mask; + struct nf_flow_key *key = &match->key; + + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_BASIC, basic); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp); + + key->meta.ingress_ifindex = tuple->iifidx; + mask->meta.ingress_ifindex = 0xffffffff; + + switch (tuple->l3proto) { + case AF_INET: + key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + key->basic.n_proto = htons(ETH_P_IP); + key->ipv4.src = tuple->src_v4.s_addr; + mask->ipv4.src = 0xffffffff; + key->ipv4.dst = tuple->dst_v4.s_addr; + mask->ipv4.dst = 0xffffffff; + break; + case AF_INET6: + key->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + key->basic.n_proto = htons(ETH_P_IPV6); + key->ipv6.src = tuple->src_v6; + memset(&mask->ipv6.src, 0xff, sizeof(mask->ipv6.src)); + key->ipv6.dst = tuple->dst_v6; + memset(&mask->ipv6.dst, 0xff, sizeof(mask->ipv6.dst)); + break; + default: + return -EOPNOTSUPP; + } + match->dissector.used_keys |= BIT(key->control.addr_type); + mask->basic.n_proto = 0xffff; + + switch (tuple->l4proto) { + case IPPROTO_TCP: + key->tcp.flags = 0; + mask->tcp.flags = cpu_to_be16(be32_to_cpu(TCP_FLAG_RST | TCP_FLAG_FIN) >> 16); + match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP); + break; + case IPPROTO_UDP: + break; + default: + return -EOPNOTSUPP; + } + + key->basic.ip_proto = tuple->l4proto; + mask->basic.ip_proto = 0xff; + + key->tp.src = tuple->src_port; + mask->tp.src = 0xffff; + key->tp.dst = tuple->dst_port; + mask->tp.dst = 0xffff; + + match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_META) | + BIT(FLOW_DISSECTOR_KEY_CONTROL) | + BIT(FLOW_DISSECTOR_KEY_BASIC) | + BIT(FLOW_DISSECTOR_KEY_PORTS); + return 0; +} + +static void flow_offload_mangle(struct flow_action_entry *entry, + enum flow_action_mangle_base htype, u32 offset, + const __be32 *value, const __be32 *mask) +{ + entry->id = FLOW_ACTION_MANGLE; + entry->mangle.htype = htype; + entry->mangle.offset = offset; + memcpy(&entry->mangle.mask, mask, sizeof(u32)); + memcpy(&entry->mangle.val, value, sizeof(u32)); +} + +static inline struct flow_action_entry * +flow_action_entry_next(struct nf_flow_rule *flow_rule) +{ + int i = flow_rule->rule->action.num_entries++; + + return &flow_rule->rule->action.entries[i]; +} + +static int flow_offload_eth_src(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + const struct flow_offload_tuple *tuple = &flow->tuplehash[!dir].tuple; + struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule); + struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule); + struct net_device *dev; + u32 mask, val; + u16 val16; + + dev = dev_get_by_index(net, tuple->iifidx); + if (!dev) + return -ENOENT; + + mask = ~0xffff0000; + memcpy(&val16, dev->dev_addr, 2); + val = val16 << 16; + flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4, + &val, &mask); + + mask = ~0xffffffff; + memcpy(&val, dev->dev_addr + 2, 4); + flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 8, + &val, &mask); + dev_put(dev); + + return 0; +} + +static int flow_offload_eth_dst(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule); + struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule); + const void *daddr = &flow->tuplehash[!dir].tuple.src_v4; + const struct dst_entry *dst_cache; + unsigned char ha[ETH_ALEN]; + struct neighbour *n; + u32 mask, val; + u8 nud_state; + u16 val16; + + dst_cache = flow->tuplehash[dir].tuple.dst_cache; + n = dst_neigh_lookup(dst_cache, daddr); + if (!n) + return -ENOENT; + + read_lock_bh(&n->lock); + nud_state = n->nud_state; + ether_addr_copy(ha, n->ha); + read_unlock_bh(&n->lock); + + if (!(nud_state & NUD_VALID)) { + neigh_release(n); + return -ENOENT; + } + + mask = ~0xffffffff; + memcpy(&val, ha, 4); + flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 0, + &val, &mask); + + mask = ~0x0000ffff; + memcpy(&val16, ha + 4, 2); + val = val16; + flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4, + &val, &mask); + neigh_release(n); + + return 0; +} + +static void flow_offload_ipv4_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + u32 mask = ~htonl(0xffffffff); + __be32 addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr; + offset = offsetof(struct iphdr, saddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr; + offset = offsetof(struct iphdr, daddr); + break; + default: + return; + } + + flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset, + &addr, &mask); +} + +static void flow_offload_ipv4_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + u32 mask = ~htonl(0xffffffff); + __be32 addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr; + offset = offsetof(struct iphdr, daddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr; + offset = offsetof(struct iphdr, saddr); + break; + default: + return; + } + + flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset, + &addr, &mask); +} + +static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule, + unsigned int offset, + const __be32 *addr, const __be32 *mask) +{ + struct flow_action_entry *entry; + int i; + + for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i += sizeof(u32)) { + entry = flow_action_entry_next(flow_rule); + flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6, + offset + i, &addr[i], mask); + } +} + +static void flow_offload_ipv6_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + u32 mask = ~htonl(0xffffffff); + const __be32 *addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6.s6_addr32; + offset = offsetof(struct ipv6hdr, saddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6.s6_addr32; + offset = offsetof(struct ipv6hdr, daddr); + break; + default: + return; + } + + flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask); +} + +static void flow_offload_ipv6_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + u32 mask = ~htonl(0xffffffff); + const __be32 *addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6.s6_addr32; + offset = offsetof(struct ipv6hdr, daddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6.s6_addr32; + offset = offsetof(struct ipv6hdr, saddr); + break; + default: + return; + } + + flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask); +} + +static int flow_offload_l4proto(const struct flow_offload *flow) +{ + u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto; + u8 type = 0; + + switch (protonum) { + case IPPROTO_TCP: + type = FLOW_ACT_MANGLE_HDR_TYPE_TCP; + break; + case IPPROTO_UDP: + type = FLOW_ACT_MANGLE_HDR_TYPE_UDP; + break; + default: + break; + } + + return type; +} + +static void flow_offload_port_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + u32 mask, port; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port); + offset = 0; /* offsetof(struct tcphdr, source); */ + port = htonl(port << 16); + mask = ~htonl(0xffff0000); + break; + case FLOW_OFFLOAD_DIR_REPLY: + port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port); + offset = 0; /* offsetof(struct tcphdr, dest); */ + port = htonl(port); + mask = ~htonl(0xffff); + break; + default: + return; + } + + flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, + &port, &mask); +} + +static void flow_offload_port_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + u32 mask, port; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port); + offset = 0; /* offsetof(struct tcphdr, dest); */ + port = htonl(port); + mask = ~htonl(0xffff); + break; + case FLOW_OFFLOAD_DIR_REPLY: + port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port); + offset = 0; /* offsetof(struct tcphdr, source); */ + port = htonl(port << 16); + mask = ~htonl(0xffff0000); + break; + default: + return; + } + + flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, + &port, &mask); +} + +static void flow_offload_ipv4_checksum(struct net *net, + const struct flow_offload *flow, + struct nf_flow_rule *flow_rule) +{ + u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto; + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + + entry->id = FLOW_ACTION_CSUM; + entry->csum_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR; + + switch (protonum) { + case IPPROTO_TCP: + entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_TCP; + break; + case IPPROTO_UDP: + entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_UDP; + break; + } +} + +static void flow_offload_redirect(const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + struct rtable *rt; + + rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; + entry->id = FLOW_ACTION_REDIRECT; + entry->dev = rt->dst.dev; + dev_hold(rt->dst.dev); +} + +int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || + flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) + return -1; + + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { + flow_offload_ipv4_snat(net, flow, dir, flow_rule); + flow_offload_port_snat(net, flow, dir, flow_rule); + } + if (test_bit(NF_FLOW_DNAT, &flow->flags)) { + flow_offload_ipv4_dnat(net, flow, dir, flow_rule); + flow_offload_port_dnat(net, flow, dir, flow_rule); + } + if (test_bit(NF_FLOW_SNAT, &flow->flags) || + test_bit(NF_FLOW_DNAT, &flow->flags)) + flow_offload_ipv4_checksum(net, flow, flow_rule); + + flow_offload_redirect(flow, dir, flow_rule); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4); + +int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || + flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) + return -1; + + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { + flow_offload_ipv6_snat(net, flow, dir, flow_rule); + flow_offload_port_snat(net, flow, dir, flow_rule); + } + if (test_bit(NF_FLOW_DNAT, &flow->flags)) { + flow_offload_ipv6_dnat(net, flow, dir, flow_rule); + flow_offload_port_dnat(net, flow, dir, flow_rule); + } + + flow_offload_redirect(flow, dir, flow_rule); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv6); + +#define NF_FLOW_RULE_ACTION_MAX 16 + +static struct nf_flow_rule * +nf_flow_offload_rule_alloc(struct net *net, + const struct flow_offload_work *offload, + enum flow_offload_tuple_dir dir) +{ + const struct nf_flowtable *flowtable = offload->flowtable; + const struct flow_offload *flow = offload->flow; + const struct flow_offload_tuple *tuple; + struct nf_flow_rule *flow_rule; + int err = -ENOMEM; + + flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL); + if (!flow_rule) + goto err_flow; + + flow_rule->rule = flow_rule_alloc(NF_FLOW_RULE_ACTION_MAX); + if (!flow_rule->rule) + goto err_flow_rule; + + flow_rule->rule->match.dissector = &flow_rule->match.dissector; + flow_rule->rule->match.mask = &flow_rule->match.mask; + flow_rule->rule->match.key = &flow_rule->match.key; + + tuple = &flow->tuplehash[dir].tuple; + err = nf_flow_rule_match(&flow_rule->match, tuple); + if (err < 0) + goto err_flow_match; + + flow_rule->rule->action.num_entries = 0; + if (flowtable->type->action(net, flow, dir, flow_rule) < 0) + goto err_flow_match; + + return flow_rule; + +err_flow_match: + kfree(flow_rule->rule); +err_flow_rule: + kfree(flow_rule); +err_flow: + return NULL; +} + +static void __nf_flow_offload_destroy(struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry; + int i; + + for (i = 0; i < flow_rule->rule->action.num_entries; i++) { + entry = &flow_rule->rule->action.entries[i]; + if (entry->id != FLOW_ACTION_REDIRECT) + continue; + + dev_put(entry->dev); + } + kfree(flow_rule->rule); + kfree(flow_rule); +} + +static void nf_flow_offload_destroy(struct nf_flow_rule *flow_rule[]) +{ + int i; + + for (i = 0; i < FLOW_OFFLOAD_DIR_MAX; i++) + __nf_flow_offload_destroy(flow_rule[i]); +} + +static int nf_flow_offload_alloc(const struct flow_offload_work *offload, + struct nf_flow_rule *flow_rule[]) +{ + struct net *net = read_pnet(&offload->flowtable->net); + + flow_rule[0] = nf_flow_offload_rule_alloc(net, offload, + FLOW_OFFLOAD_DIR_ORIGINAL); + if (!flow_rule[0]) + return -ENOMEM; + + flow_rule[1] = nf_flow_offload_rule_alloc(net, offload, + FLOW_OFFLOAD_DIR_REPLY); + if (!flow_rule[1]) { + __nf_flow_offload_destroy(flow_rule[0]); + return -ENOMEM; + } + + return 0; +} + +static void nf_flow_offload_init(struct flow_cls_offload *cls_flow, + __be16 proto, int priority, + enum flow_cls_command cmd, + const struct flow_offload_tuple *tuple, + struct netlink_ext_ack *extack) +{ + cls_flow->common.protocol = proto; + cls_flow->common.prio = priority; + cls_flow->common.extack = extack; + cls_flow->command = cmd; + cls_flow->cookie = (unsigned long)tuple; +} + +static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, + struct flow_offload *flow, + struct nf_flow_rule *flow_rule, + enum flow_offload_tuple_dir dir, + int priority, int cmd, + struct list_head *block_cb_list) +{ + struct flow_cls_offload cls_flow = {}; + struct flow_block_cb *block_cb; + struct netlink_ext_ack extack; + __be16 proto = ETH_P_ALL; + int err, i = 0; + + nf_flow_offload_init(&cls_flow, proto, priority, cmd, + &flow->tuplehash[dir].tuple, &extack); + if (cmd == FLOW_CLS_REPLACE) + cls_flow.rule = flow_rule->rule; + + list_for_each_entry(block_cb, block_cb_list, list) { + err = block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, + block_cb->cb_priv); + if (err < 0) + continue; + + i++; + } + + return i; +} + +static int flow_offload_tuple_add(struct flow_offload_work *offload, + struct nf_flow_rule *flow_rule, + enum flow_offload_tuple_dir dir) +{ + return nf_flow_offload_tuple(offload->flowtable, offload->flow, + flow_rule, dir, offload->priority, + FLOW_CLS_REPLACE, + &offload->flowtable->flow_block.cb_list); +} + +static void flow_offload_tuple_del(struct flow_offload_work *offload, + enum flow_offload_tuple_dir dir) +{ + nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir, + offload->priority, FLOW_CLS_DESTROY, + &offload->flowtable->flow_block.cb_list); +} + +static int flow_offload_rule_add(struct flow_offload_work *offload, + struct nf_flow_rule *flow_rule[]) +{ + int ok_count = 0; + + ok_count += flow_offload_tuple_add(offload, flow_rule[0], + FLOW_OFFLOAD_DIR_ORIGINAL); + ok_count += flow_offload_tuple_add(offload, flow_rule[1], + FLOW_OFFLOAD_DIR_REPLY); + if (ok_count == 0) + return -ENOENT; + + return 0; +} + +static void flow_offload_work_add(struct flow_offload_work *offload) +{ + struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX]; + int err; + + err = nf_flow_offload_alloc(offload, flow_rule); + if (err < 0) + return; + + err = flow_offload_rule_add(offload, flow_rule); + if (err < 0) + set_bit(NF_FLOW_HW_REFRESH, &offload->flow->flags); + + nf_flow_offload_destroy(flow_rule); +} + +static void flow_offload_work_del(struct flow_offload_work *offload) +{ + flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); + flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); + set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); +} + +static void flow_offload_tuple_stats(struct flow_offload_work *offload, + enum flow_offload_tuple_dir dir, + struct flow_stats *stats) +{ + struct nf_flowtable *flowtable = offload->flowtable; + struct flow_cls_offload cls_flow = {}; + struct flow_block_cb *block_cb; + struct netlink_ext_ack extack; + __be16 proto = ETH_P_ALL; + + nf_flow_offload_init(&cls_flow, proto, offload->priority, + FLOW_CLS_STATS, + &offload->flow->tuplehash[dir].tuple, &extack); + + list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list) + block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv); + memcpy(stats, &cls_flow.stats, sizeof(*stats)); +} + +static void flow_offload_work_stats(struct flow_offload_work *offload) +{ + struct flow_stats stats[FLOW_OFFLOAD_DIR_MAX] = {}; + u64 lastused; + + flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]); + flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, &stats[1]); + + lastused = max_t(u64, stats[0].lastused, stats[1].lastused); + offload->flow->timeout = max_t(u64, offload->flow->timeout, + lastused + NF_FLOW_TIMEOUT); +} + +static void flow_offload_work_handler(struct work_struct *work) +{ + struct flow_offload_work *offload, *next; + LIST_HEAD(offload_pending_list); + + spin_lock_bh(&flow_offload_pending_list_lock); + list_replace_init(&flow_offload_pending_list, &offload_pending_list); + spin_unlock_bh(&flow_offload_pending_list_lock); + + list_for_each_entry_safe(offload, next, &offload_pending_list, list) { + switch (offload->cmd) { + case FLOW_CLS_REPLACE: + flow_offload_work_add(offload); + break; + case FLOW_CLS_DESTROY: + flow_offload_work_del(offload); + break; + case FLOW_CLS_STATS: + flow_offload_work_stats(offload); + break; + default: + WARN_ON_ONCE(1); + } + list_del(&offload->list); + kfree(offload); + } +} + +static void flow_offload_queue_work(struct flow_offload_work *offload) +{ + spin_lock_bh(&flow_offload_pending_list_lock); + list_add_tail(&offload->list, &flow_offload_pending_list); + spin_unlock_bh(&flow_offload_pending_list_lock); + + schedule_work(&nf_flow_offload_work); +} + +static struct flow_offload_work * +nf_flow_offload_work_alloc(struct nf_flowtable *flowtable, + struct flow_offload *flow, unsigned int cmd) +{ + struct flow_offload_work *offload; + + offload = kmalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); + if (!offload) + return NULL; + + offload->cmd = cmd; + offload->flow = flow; + offload->priority = flowtable->priority; + offload->flowtable = flowtable; + + return offload; +} + + +void nf_flow_offload_add(struct nf_flowtable *flowtable, + struct flow_offload *flow) +{ + struct flow_offload_work *offload; + + offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_REPLACE); + if (!offload) + return; + + flow_offload_queue_work(offload); +} + +void nf_flow_offload_del(struct nf_flowtable *flowtable, + struct flow_offload *flow) +{ + struct flow_offload_work *offload; + + offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_DESTROY); + if (!offload) + return; + + set_bit(NF_FLOW_HW_DYING, &flow->flags); + flow_offload_queue_work(offload); +} + +void nf_flow_offload_stats(struct nf_flowtable *flowtable, + struct flow_offload *flow) +{ + struct flow_offload_work *offload; + __s32 delta; + + delta = nf_flow_timeout_delta(flow->timeout); + if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10)) + return; + + offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_STATS); + if (!offload) + return; + + flow_offload_queue_work(offload); +} + +void nf_flow_table_offload_flush(struct nf_flowtable *flowtable) +{ + if (nf_flowtable_hw_offload(flowtable)) + flush_work(&nf_flow_offload_work); +} + +static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, + struct flow_block_offload *bo, + enum flow_block_command cmd) +{ + struct flow_block_cb *block_cb, *next; + int err = 0; + + switch (cmd) { + case FLOW_BLOCK_BIND: + list_splice(&bo->cb_list, &flowtable->flow_block.cb_list); + break; + case FLOW_BLOCK_UNBIND: + list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { + list_del(&block_cb->list); + flow_block_cb_free(block_cb); + } + break; + default: + WARN_ON_ONCE(1); + err = -EOPNOTSUPP; + } + + return err; +} + +static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, + struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd, + struct netlink_ext_ack *extack) +{ + int err; + + if (!nf_flowtable_hw_offload(flowtable)) + return 0; + + if (!dev->netdev_ops->ndo_setup_tc) + return -EOPNOTSUPP; + + memset(bo, 0, sizeof(*bo)); + bo->net = dev_net(dev); + bo->block = &flowtable->flow_block; + bo->command = cmd; + bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + bo->extack = extack; + INIT_LIST_HEAD(&bo->cb_list); + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo); + if (err < 0) + return err; + + return 0; +} + +int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd) +{ + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; + int err; + + err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, &extack); + if (err < 0) + return err; + + return nf_flow_table_block_setup(flowtable, &bo, cmd); +} +EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup); + +int nf_flow_table_offload_init(void) +{ + INIT_WORK(&nf_flow_offload_work, flow_offload_work_handler); + + return 0; +} + +void nf_flow_table_offload_exit(void) +{ + struct flow_offload_work *offload, *next; + LIST_HEAD(offload_pending_list); + + cancel_work_sync(&nf_flow_offload_work); + + list_for_each_entry_safe(offload, next, &offload_pending_list, list) { + list_del(&offload->list); + kfree(offload); + } +} diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 3f6023ed4966..bfc555fcbc72 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -18,12 +18,12 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_helper.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_zones.h> -#include <linux/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_helper.h> +#include <uapi/linux/netfilter/nf_nat.h> #include "nf_internals.h" diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 7ac733ebd060..64eedc17037a 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -233,6 +233,19 @@ icmp_manip_pkt(struct sk_buff *skb, return false; hdr = (struct icmphdr *)(skb->data + hdroff); + switch (hdr->type) { + case ICMP_ECHO: + case ICMP_ECHOREPLY: + case ICMP_TIMESTAMP: + case ICMP_TIMESTAMPREPLY: + case ICMP_INFO_REQUEST: + case ICMP_INFO_REPLY: + case ICMP_ADDRESS: + case ICMP_ADDRESSREPLY: + break; + default: + return true; + } inet_proto_csum_replace2(&hdr->checksum, skb, hdr->un.echo.id, tuple->src.u.icmp.id, false); hdr->un.echo.id = tuple->src.u.icmp.id; @@ -722,7 +735,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, return ret; } -const struct nf_hook_ops nf_nat_ipv4_ops[] = { +static const struct nf_hook_ops nf_nat_ipv4_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv4_in, @@ -961,7 +974,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, return ret; } -const struct nf_hook_ops nf_nat_ipv6_ops[] = { +static const struct nf_hook_ops nf_nat_ipv6_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv6_in, diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index a2b58de82600..f8f52ff99cfb 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -189,7 +189,7 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, goto err; } - if (!skb_dst_force(skb) && state->hook != NF_INET_PRE_ROUTING) { + if (skb_dst(skb) && !skb_dst_force(skb)) { status = -ENETDOWN; goto err; } diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index c769462a839e..b0930d4aba22 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -56,7 +56,7 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, switch (opcode) { case TCPOPT_MSS: if (opsize == TCPOLEN_MSS) { - opts->mss = get_unaligned_be16(ptr); + opts->mss_option = get_unaligned_be16(ptr); opts->options |= NF_SYNPROXY_OPT_MSS; } break; @@ -115,7 +115,7 @@ synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) if (options & NF_SYNPROXY_OPT_MSS) *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | - opts->mss); + opts->mss_option); if (options & NF_SYNPROXY_OPT_TIMESTAMP) { if (options & NF_SYNPROXY_OPT_SACK_PERM) @@ -642,7 +642,7 @@ synproxy_recv_client_ack(struct net *net, } this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; + opts->mss_option = mss; opts->options |= NF_SYNPROXY_OPT_MSS; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) @@ -1060,7 +1060,7 @@ synproxy_recv_client_ack_ipv6(struct net *net, } this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; + opts->mss_option = mss; opts->options |= NF_SYNPROXY_OPT_MSS; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d47469f824a1..d1318bdf49ca 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -22,6 +22,8 @@ #include <net/net_namespace.h> #include <net/sock.h> +#define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-")) + static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); @@ -151,11 +153,64 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) } } +static int nft_netdev_register_hooks(struct net *net, + struct list_head *hook_list) +{ + struct nft_hook *hook; + int err, j; + + j = 0; + list_for_each_entry(hook, hook_list, list) { + err = nf_register_net_hook(net, &hook->ops); + if (err < 0) + goto err_register; + + j++; + } + return 0; + +err_register: + list_for_each_entry(hook, hook_list, list) { + if (j-- <= 0) + break; + + nf_unregister_net_hook(net, &hook->ops); + } + return err; +} + +static void nft_netdev_unregister_hooks(struct net *net, + struct list_head *hook_list) +{ + struct nft_hook *hook; + + list_for_each_entry(hook, hook_list, list) + nf_unregister_net_hook(net, &hook->ops); +} + +static int nft_register_basechain_hooks(struct net *net, int family, + struct nft_base_chain *basechain) +{ + if (family == NFPROTO_NETDEV) + return nft_netdev_register_hooks(net, &basechain->hook_list); + + return nf_register_net_hook(net, &basechain->ops); +} + +static void nft_unregister_basechain_hooks(struct net *net, int family, + struct nft_base_chain *basechain) +{ + if (family == NFPROTO_NETDEV) + nft_netdev_unregister_hooks(net, &basechain->hook_list); + else + nf_unregister_net_hook(net, &basechain->ops); +} + static int nf_tables_register_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) { - const struct nft_base_chain *basechain; + struct nft_base_chain *basechain; const struct nf_hook_ops *ops; if (table->flags & NFT_TABLE_F_DORMANT || @@ -168,14 +223,14 @@ static int nf_tables_register_hook(struct net *net, if (basechain->type->ops_register) return basechain->type->ops_register(net, ops); - return nf_register_net_hook(net, ops); + return nft_register_basechain_hooks(net, table->family, basechain); } static void nf_tables_unregister_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) { - const struct nft_base_chain *basechain; + struct nft_base_chain *basechain; const struct nf_hook_ops *ops; if (table->flags & NFT_TABLE_F_DORMANT || @@ -187,7 +242,7 @@ static void nf_tables_unregister_hook(struct net *net, if (basechain->type->ops_unregister) return basechain->type->ops_unregister(net, ops); - nf_unregister_net_hook(net, ops); + nft_unregister_basechain_hooks(net, table->family, basechain); } static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) @@ -308,6 +363,7 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule) { + struct nft_flow_rule *flow; struct nft_trans *trans; int err; @@ -315,6 +371,16 @@ static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule) if (trans == NULL) return -ENOMEM; + if (ctx->chain->flags & NFT_CHAIN_HW_OFFLOAD) { + flow = nft_flow_rule_create(ctx->net, rule); + if (IS_ERR(flow)) { + nft_trans_destroy(trans); + return PTR_ERR(flow); + } + + nft_trans_flow_rule(trans) = flow; + } + err = nf_tables_delrule_deactivate(ctx, rule); if (err < 0) { nft_trans_destroy(trans); @@ -487,46 +553,70 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table) static const struct nft_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX]; static const struct nft_chain_type * +__nft_chain_type_get(u8 family, enum nft_chain_types type) +{ + if (family >= NFPROTO_NUMPROTO || + type >= NFT_CHAIN_T_MAX) + return NULL; + + return chain_type[family][type]; +} + +static const struct nft_chain_type * __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family) { + const struct nft_chain_type *type; int i; for (i = 0; i < NFT_CHAIN_T_MAX; i++) { - if (chain_type[family][i] != NULL && - !nla_strcmp(nla, chain_type[family][i]->name)) - return chain_type[family][i]; + type = __nft_chain_type_get(family, i); + if (!type) + continue; + if (!nla_strcmp(nla, type->name)) + return type; } return NULL; } -/* - * Loading a module requires dropping mutex that guards the - * transaction. - * We first need to abort any pending transactions as once - * mutex is unlocked a different client could start a new - * transaction. It must not see any 'future generation' - * changes * as these changes will never happen. - */ -#ifdef CONFIG_MODULES -static int __nf_tables_abort(struct net *net); +struct nft_module_request { + struct list_head list; + char module[MODULE_NAME_LEN]; + bool done; +}; -static void nft_request_module(struct net *net, const char *fmt, ...) +#ifdef CONFIG_MODULES +static int nft_request_module(struct net *net, const char *fmt, ...) { char module_name[MODULE_NAME_LEN]; + struct nft_module_request *req; va_list args; int ret; - __nf_tables_abort(net); - va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); - if (WARN(ret >= MODULE_NAME_LEN, "truncated: '%s' (len %d)", module_name, ret)) - return; + if (ret >= MODULE_NAME_LEN) + return 0; - mutex_unlock(&net->nft.commit_mutex); - request_module("%s", module_name); - mutex_lock(&net->nft.commit_mutex); + list_for_each_entry(req, &net->nft.module_list, list) { + if (!strcmp(req->module, module_name)) { + if (req->done) + return 0; + + /* A request to load this module already exists. */ + return -EAGAIN; + } + } + + req = kmalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->done = false; + strlcpy(req->module, module_name, MODULE_NAME_LEN); + list_add_tail(&req->list, &net->nft.module_list); + + return -EAGAIN; } #endif @@ -550,10 +640,9 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (autoload) { - nft_request_module(net, "nft-chain-%u-%.*s", family, - nla_len(nla), (const char *)nla_data(nla)); - type = __nf_tables_chain_type_lookup(nla, family); - if (type != NULL) + if (nft_request_module(net, "nft-chain-%u-%.*s", family, + nla_len(nla), + (const char *)nla_data(nla)) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -742,7 +831,8 @@ static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt) if (cnt && i++ == cnt) break; - nf_unregister_net_hook(net, &nft_base_chain(chain)->ops); + nft_unregister_basechain_hooks(net, table->family, + nft_base_chain(chain)); } } @@ -757,14 +847,16 @@ static int nf_tables_table_enable(struct net *net, struct nft_table *table) if (!nft_is_base_chain(chain)) continue; - err = nf_register_net_hook(net, &nft_base_chain(chain)->ops); + err = nft_register_basechain_hooks(net, table->family, + nft_base_chain(chain)); if (err < 0) - goto err; + goto err_register_hooks; i++; } return 0; -err: + +err_register_hooks: if (i) nft_table_disable(net, table, i); return err; @@ -978,12 +1070,18 @@ static int nft_flush_table(struct nft_ctx *ctx) } list_for_each_entry_safe(flowtable, nft, &ctx->table->flowtables, list) { + if (!nft_is_active_next(ctx->net, flowtable)) + continue; + err = nft_delflowtable(ctx, flowtable); if (err < 0) goto out; } list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) { + if (!nft_is_active_next(ctx->net, obj)) + continue; + err = nft_delobj(ctx, obj); if (err < 0) goto out; @@ -1086,11 +1184,8 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx) void nft_register_chain_type(const struct nft_chain_type *ctype) { - if (WARN_ON(ctype->family >= NFPROTO_NUMPROTO)) - return; - nfnl_lock(NFNL_SUBSYS_NFTABLES); - if (WARN_ON(chain_type[ctype->family][ctype->type] != NULL)) { + if (WARN_ON(__nft_chain_type_get(ctype->family, ctype->type))) { nfnl_unlock(NFNL_SUBSYS_NFTABLES); return; } @@ -1174,7 +1269,8 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { .len = NFT_CHAIN_MAXNAMELEN - 1 }, [NFTA_CHAIN_HOOK] = { .type = NLA_NESTED }, [NFTA_CHAIN_POLICY] = { .type = NLA_U32 }, - [NFTA_CHAIN_TYPE] = { .type = NLA_STRING }, + [NFTA_CHAIN_TYPE] = { .type = NLA_STRING, + .len = NFT_MODULE_AUTOLOAD_LIMIT }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 }, }; @@ -1225,6 +1321,46 @@ nla_put_failure: return -ENOSPC; } +static int nft_dump_basechain_hook(struct sk_buff *skb, int family, + const struct nft_base_chain *basechain) +{ + const struct nf_hook_ops *ops = &basechain->ops; + struct nft_hook *hook, *first = NULL; + struct nlattr *nest, *nest_devs; + int n = 0; + + nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK); + if (nest == NULL) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) + goto nla_put_failure; + + if (family == NFPROTO_NETDEV) { + nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS); + list_for_each_entry(hook, &basechain->hook_list, list) { + if (!first) + first = hook; + + if (nla_put_string(skb, NFTA_DEVICE_NAME, + hook->ops.dev->name)) + goto nla_put_failure; + n++; + } + nla_nest_end(skb, nest_devs); + + if (n == 1 && + nla_put_string(skb, NFTA_HOOK_DEV, first->ops.dev->name)) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + + return 0; +nla_put_failure: + return -1; +} + static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table, @@ -1253,21 +1389,10 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nft_is_base_chain(chain)) { const struct nft_base_chain *basechain = nft_base_chain(chain); - const struct nf_hook_ops *ops = &basechain->ops; struct nft_stats __percpu *stats; - struct nlattr *nest; - nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK); - if (nest == NULL) - goto nla_put_failure; - if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum))) - goto nla_put_failure; - if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) + if (nft_dump_basechain_hook(skb, family, basechain)) goto nla_put_failure; - if (basechain->dev_name[0] && - nla_put_string(skb, NFTA_HOOK_DEV, basechain->dev_name)) - goto nla_put_failure; - nla_nest_end(skb, nest); if (nla_put_be32(skb, NFTA_CHAIN_POLICY, htonl(basechain->policy))) @@ -1461,8 +1586,9 @@ static void nft_chain_stats_replace(struct nft_trans *trans) if (!nft_trans_chain_stats(trans)) return; - rcu_swap_protected(chain->stats, nft_trans_chain_stats(trans), - lockdep_commit_lock_is_held(trans->ctx.net)); + nft_trans_chain_stats(trans) = + rcu_replace_pointer(chain->stats, nft_trans_chain_stats(trans), + lockdep_commit_lock_is_held(trans->ctx.net)); if (!nft_trans_chain_stats(trans)) static_branch_inc(&nft_counters_enabled); @@ -1485,6 +1611,7 @@ static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) static void nf_tables_chain_destroy(struct nft_ctx *ctx) { struct nft_chain *chain = ctx->chain; + struct nft_hook *hook, *next; if (WARN_ON(chain->use > 0)) return; @@ -1495,6 +1622,13 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx) if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); + if (ctx->family == NFPROTO_NETDEV) { + list_for_each_entry_safe(hook, next, + &basechain->hook_list, list) { + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } + } module_put(basechain->type->owner); if (rcu_access_pointer(basechain->stats)) { static_branch_dec(&nft_counters_enabled); @@ -1508,13 +1642,126 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx) } } +static struct nft_hook *nft_netdev_hook_alloc(struct net *net, + const struct nlattr *attr) +{ + struct net_device *dev; + char ifname[IFNAMSIZ]; + struct nft_hook *hook; + int err; + + hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL); + if (!hook) { + err = -ENOMEM; + goto err_hook_alloc; + } + + nla_strlcpy(ifname, attr, IFNAMSIZ); + dev = __dev_get_by_name(net, ifname); + if (!dev) { + err = -ENOENT; + goto err_hook_dev; + } + hook->ops.dev = dev; + + return hook; + +err_hook_dev: + kfree(hook); +err_hook_alloc: + return ERR_PTR(err); +} + +static bool nft_hook_list_find(struct list_head *hook_list, + const struct nft_hook *this) +{ + struct nft_hook *hook; + + list_for_each_entry(hook, hook_list, list) { + if (this->ops.dev == hook->ops.dev) + return true; + } + + return false; +} + +static int nf_tables_parse_netdev_hooks(struct net *net, + const struct nlattr *attr, + struct list_head *hook_list) +{ + struct nft_hook *hook, *next; + const struct nlattr *tmp; + int rem, n = 0, err; + + nla_for_each_nested(tmp, attr, rem) { + if (nla_type(tmp) != NFTA_DEVICE_NAME) { + err = -EINVAL; + goto err_hook; + } + + hook = nft_netdev_hook_alloc(net, tmp); + if (IS_ERR(hook)) { + err = PTR_ERR(hook); + goto err_hook; + } + if (nft_hook_list_find(hook_list, hook)) { + kfree(hook); + err = -EEXIST; + goto err_hook; + } + list_add_tail(&hook->list, hook_list); + n++; + + if (n == NFT_NETDEVICE_MAX) { + err = -EFBIG; + goto err_hook; + } + } + if (!n) + return -EINVAL; + + return 0; + +err_hook: + list_for_each_entry_safe(hook, next, hook_list, list) { + list_del(&hook->list); + kfree(hook); + } + return err; +} + struct nft_chain_hook { u32 num; s32 priority; const struct nft_chain_type *type; - struct net_device *dev; + struct list_head list; }; +static int nft_chain_parse_netdev(struct net *net, + struct nlattr *tb[], + struct list_head *hook_list) +{ + struct nft_hook *hook; + int err; + + if (tb[NFTA_HOOK_DEV]) { + hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]); + if (IS_ERR(hook)) + return PTR_ERR(hook); + + list_add_tail(&hook->list, hook_list); + } else if (tb[NFTA_HOOK_DEVS]) { + err = nf_tables_parse_netdev_hooks(net, tb[NFTA_HOOK_DEVS], + hook_list); + if (err < 0) + return err; + } else { + return -EINVAL; + } + + return 0; +} + static int nft_chain_parse_hook(struct net *net, const struct nlattr * const nla[], struct nft_chain_hook *hook, u8 family, @@ -1522,7 +1769,6 @@ static int nft_chain_parse_hook(struct net *net, { struct nlattr *ha[NFTA_HOOK_MAX + 1]; const struct nft_chain_type *type; - struct net_device *dev; int err; lockdep_assert_held(&net->nft.commit_mutex); @@ -1541,7 +1787,10 @@ static int nft_chain_parse_hook(struct net *net, hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); - type = chain_type[family][NFT_CHAIN_T_DEFAULT]; + type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT); + if (!type) + return -EOPNOTSUPP; + if (nla[NFTA_CHAIN_TYPE]) { type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE], family, autoload); @@ -1560,23 +1809,14 @@ static int nft_chain_parse_hook(struct net *net, hook->type = type; - hook->dev = NULL; + INIT_LIST_HEAD(&hook->list); if (family == NFPROTO_NETDEV) { - char ifname[IFNAMSIZ]; - - if (!ha[NFTA_HOOK_DEV]) { - module_put(type->owner); - return -EOPNOTSUPP; - } - - nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ); - dev = __dev_get_by_name(net, ifname); - if (!dev) { + err = nft_chain_parse_netdev(net, ha, &hook->list); + if (err < 0) { module_put(type->owner); - return -ENOENT; + return err; } - hook->dev = dev; - } else if (ha[NFTA_HOOK_DEV]) { + } else if (ha[NFTA_HOOK_DEV] || ha[NFTA_HOOK_DEVS]) { module_put(type->owner); return -EOPNOTSUPP; } @@ -1586,6 +1826,12 @@ static int nft_chain_parse_hook(struct net *net, static void nft_chain_release_hook(struct nft_chain_hook *hook) { + struct nft_hook *h, *next; + + list_for_each_entry_safe(h, next, &hook->list, list) { + list_del(&h->list); + kfree(h); + } module_put(hook->type->owner); } @@ -1610,6 +1856,49 @@ static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *cha return kvmalloc(alloc, GFP_KERNEL); } +static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family, + const struct nft_chain_hook *hook, + struct nft_chain *chain) +{ + ops->pf = family; + ops->hooknum = hook->num; + ops->priority = hook->priority; + ops->priv = chain; + ops->hook = hook->type->hooks[ops->hooknum]; +} + +static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, + struct nft_chain_hook *hook, u32 flags) +{ + struct nft_chain *chain; + struct nft_hook *h; + + basechain->type = hook->type; + INIT_LIST_HEAD(&basechain->hook_list); + chain = &basechain->chain; + + if (family == NFPROTO_NETDEV) { + list_splice_init(&hook->list, &basechain->hook_list); + list_for_each_entry(h, &basechain->hook_list, list) + nft_basechain_hook_init(&h->ops, family, hook, chain); + + basechain->ops.hooknum = hook->num; + basechain->ops.priority = hook->priority; + } else { + nft_basechain_hook_init(&basechain->ops, family, hook, chain); + } + + chain->flags |= NFT_BASE_CHAIN | flags; + basechain->policy = NF_ACCEPT; + if (chain->flags & NFT_CHAIN_HW_OFFLOAD && + nft_chain_offload_priority(basechain) < 0) + return -EOPNOTSUPP; + + flow_block_init(&basechain->flow_block); + + return 0; +} + static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, u8 policy, u32 flags) { @@ -1628,7 +1917,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nla[NFTA_CHAIN_HOOK]) { struct nft_chain_hook hook; - struct nf_hook_ops *ops; err = nft_chain_parse_hook(net, nla, &hook, family, true); if (err < 0) @@ -1639,9 +1927,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, nft_chain_release_hook(&hook); return -ENOMEM; } - - if (hook.dev != NULL) - strncpy(basechain->dev_name, hook.dev->name, IFNAMSIZ); + chain = &basechain->chain; if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); @@ -1654,24 +1940,12 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, static_branch_inc(&nft_counters_enabled); } - basechain->type = hook.type; - chain = &basechain->chain; - - ops = &basechain->ops; - ops->pf = family; - ops->hooknum = hook.num; - ops->priority = hook.priority; - ops->priv = chain; - ops->hook = hook.type->hooks[ops->hooknum]; - ops->dev = hook.dev; - - chain->flags |= NFT_BASE_CHAIN | flags; - basechain->policy = NF_ACCEPT; - if (chain->flags & NFT_CHAIN_HW_OFFLOAD && - nft_chain_offload_priority(basechain) < 0) - return -EOPNOTSUPP; - - flow_block_init(&basechain->flow_block); + err = nft_basechain_init(basechain, family, &hook, flags); + if (err < 0) { + nft_chain_release_hook(&hook); + kfree(basechain); + return err; + } } else { chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) @@ -1715,7 +1989,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, goto err2; } - nft_trans_chain_policy(trans) = -1; + nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET; if (nft_is_base_chain(chain)) nft_trans_chain_policy(trans) = policy; @@ -1731,6 +2005,25 @@ err1: return err; } +static bool nft_hook_list_equal(struct list_head *hook_list1, + struct list_head *hook_list2) +{ + struct nft_hook *hook; + int n = 0, m = 0; + + n = 0; + list_for_each_entry(hook, hook_list2, list) { + if (!nft_hook_list_find(hook_list1, hook)) + return false; + + n++; + } + list_for_each_entry(hook, hook_list1, list) + m++; + + return n == m; +} + static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, u32 flags) { @@ -1762,12 +2055,19 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, return -EBUSY; } - ops = &basechain->ops; - if (ops->hooknum != hook.num || - ops->priority != hook.priority || - ops->dev != hook.dev) { - nft_chain_release_hook(&hook); - return -EBUSY; + if (ctx->family == NFPROTO_NETDEV) { + if (!nft_hook_list_equal(&basechain->hook_list, + &hook.list)) { + nft_chain_release_hook(&hook); + return -EBUSY; + } + } else { + ops = &basechain->ops; + if (ops->hooknum != hook.num || + ops->priority != hook.priority) { + nft_chain_release_hook(&hook); + return -EBUSY; + } } nft_chain_release_hook(&hook); } @@ -1922,6 +2222,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; + flags |= chain->flags & NFT_BASE_CHAIN; return nf_tables_updchain(&ctx, genmask, policy, flags); } @@ -2049,9 +2350,8 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family, static int nft_expr_type_request_module(struct net *net, u8 family, struct nlattr *nla) { - nft_request_module(net, "nft-expr-%u-%.*s", family, - nla_len(nla), (char *)nla_data(nla)); - if (__nft_expr_type_get(family, nla)) + if (nft_request_module(net, "nft-expr-%u-%.*s", family, + nla_len(nla), (char *)nla_data(nla)) == -EAGAIN) return -EAGAIN; return 0; @@ -2077,9 +2377,9 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, if (nft_expr_type_request_module(net, family, nla) == -EAGAIN) return ERR_PTR(-EAGAIN); - nft_request_module(net, "nft-expr-%.*s", - nla_len(nla), (char *)nla_data(nla)); - if (__nft_expr_type_get(family, nla)) + if (nft_request_module(net, "nft-expr-%.*s", + nla_len(nla), + (char *)nla_data(nla)) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -2087,7 +2387,8 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, } static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = { - [NFTA_EXPR_NAME] = { .type = NLA_STRING }, + [NFTA_EXPR_NAME] = { .type = NLA_STRING, + .len = NFT_MODULE_AUTOLOAD_LIMIT }, [NFTA_EXPR_DATA] = { .type = NLA_NESTED }, }; @@ -2169,9 +2470,10 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, err = PTR_ERR(ops); #ifdef CONFIG_MODULES if (err == -EAGAIN) - nft_expr_type_request_module(ctx->net, - ctx->family, - tb[NFTA_EXPR_NAME]); + if (nft_expr_type_request_module(ctx->net, + ctx->family, + tb[NFTA_EXPR_NAME]) != -EAGAIN) + err = -ENOENT; #endif goto err1; } @@ -2853,7 +3155,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return nft_table_validate(net, table); if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { - flow = nft_flow_rule_create(rule); + flow = nft_flow_rule_create(net, rule); if (IS_ERR(flow)) return PTR_ERR(flow); @@ -3008,8 +3310,7 @@ nft_select_set_ops(const struct nft_ctx *ctx, lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (list_empty(&nf_tables_set_types)) { - nft_request_module(ctx->net, "nft-set"); - if (!list_empty(&nf_tables_set_types)) + if (nft_request_module(ctx->net, "nft-set") == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -3090,6 +3391,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 }, + [NFTA_SET_DESC_CONCAT] = { .type = NLA_NESTED }, }; static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, @@ -3256,6 +3558,33 @@ static __be64 nf_jiffies64_to_msecs(u64 input) return cpu_to_be64(jiffies64_to_msecs(input)); } +static int nf_tables_fill_set_concat(struct sk_buff *skb, + const struct nft_set *set) +{ + struct nlattr *concat, *field; + int i; + + concat = nla_nest_start_noflag(skb, NFTA_SET_DESC_CONCAT); + if (!concat) + return -ENOMEM; + + for (i = 0; i < set->field_count; i++) { + field = nla_nest_start_noflag(skb, NFTA_LIST_ELEM); + if (!field) + return -ENOMEM; + + if (nla_put_be32(skb, NFTA_SET_FIELD_LEN, + htonl(set->field_len[i]))) + return -ENOMEM; + + nla_nest_end(skb, field); + } + + nla_nest_end(skb, concat); + + return 0; +} + static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { @@ -3319,11 +3648,17 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; desc = nla_nest_start_noflag(skb, NFTA_SET_DESC); + if (desc == NULL) goto nla_put_failure; if (set->size && nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) goto nla_put_failure; + + if (set->field_count > 1 && + nf_tables_fill_set_concat(skb, set)) + goto nla_put_failure; + nla_nest_end(skb, desc); nlmsg_end(skb, nlh); @@ -3496,6 +3831,53 @@ err: return err; } +static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = { + [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 }, +}; + +static int nft_set_desc_concat_parse(const struct nlattr *attr, + struct nft_set_desc *desc) +{ + struct nlattr *tb[NFTA_SET_FIELD_MAX + 1]; + u32 len; + int err; + + err = nla_parse_nested_deprecated(tb, NFTA_SET_FIELD_MAX, attr, + nft_concat_policy, NULL); + if (err < 0) + return err; + + if (!tb[NFTA_SET_FIELD_LEN]) + return -EINVAL; + + len = ntohl(nla_get_be32(tb[NFTA_SET_FIELD_LEN])); + + if (len * BITS_PER_BYTE / 32 > NFT_REG32_COUNT) + return -E2BIG; + + desc->field_len[desc->field_count++] = len; + + return 0; +} + +static int nft_set_desc_concat(struct nft_set_desc *desc, + const struct nlattr *nla) +{ + struct nlattr *attr; + int rem, err; + + nla_for_each_nested(attr, nla, rem) { + if (nla_type(attr) != NFTA_LIST_ELEM) + return -EINVAL; + + err = nft_set_desc_concat_parse(attr, desc); + if (err < 0) + return err; + } + + return 0; +} + static int nf_tables_set_desc_parse(struct nft_set_desc *desc, const struct nlattr *nla) { @@ -3509,8 +3891,10 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc, if (da[NFTA_SET_DESC_SIZE] != NULL) desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE])); + if (da[NFTA_SET_DESC_CONCAT]) + err = nft_set_desc_concat(desc, da[NFTA_SET_DESC_CONCAT]); - return 0; + return err; } static int nf_tables_newset(struct net *net, struct sock *nlsk, @@ -3533,6 +3917,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, unsigned char *udata; u16 udlen; int err; + int i; if (nla[NFTA_SET_TABLE] == NULL || nla[NFTA_SET_NAME] == NULL || @@ -3562,8 +3947,11 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, NFT_SET_OBJECT)) return -EINVAL; /* Only one of these operations is supported */ - if ((flags & (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) == - (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) + if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) == + (NFT_SET_MAP | NFT_SET_OBJECT)) + return -EOPNOTSUPP; + if ((flags & (NFT_SET_EVAL | NFT_SET_OBJECT)) == + (NFT_SET_EVAL | NFT_SET_OBJECT)) return -EOPNOTSUPP; } @@ -3708,6 +4096,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, set->gc_int = gc_int; set->handle = nf_tables_alloc_handle(table); + set->field_count = desc.field_count; + for (i = 0; i < desc.field_count; i++) + set->field_len[i] = desc.field_len[i]; + err = ops->init(set, &desc, nla); if (err < 0) goto err3; @@ -3911,6 +4303,9 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .len = sizeof(struct nft_userdata), .align = __alignof__(struct nft_userdata), }, + [NFT_SET_EXT_KEY_END] = { + .align = __alignof__(u32), + }, }; EXPORT_SYMBOL_GPL(nft_set_ext_types); @@ -3927,7 +4322,9 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_SET_ELEM_EXPR] = { .type = NLA_NESTED }, - [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING }, + [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING, + .len = NFT_OBJ_MAXNAMELEN - 1 }, + [NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED }, }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { @@ -3977,6 +4374,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, NFT_DATA_VALUE, set->klen) < 0) goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) && + nft_data_dump(skb, NFTA_SET_ELEM_KEY_END, nft_set_ext_key_end(ext), + NFT_DATA_VALUE, set->klen) < 0) + goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext), set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE, @@ -4219,11 +4621,28 @@ static int nft_setelem_parse_flags(const struct nft_set *set, return 0; } +static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set, + struct nft_data *key, struct nlattr *attr) +{ + struct nft_data_desc desc; + int err; + + err = nft_data_init(ctx, key, NFT_DATA_VALUE_MAXLEN, &desc, attr); + if (err < 0) + return err; + + if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) { + nft_data_release(key, desc.type); + return -EINVAL; + } + + return 0; +} + static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; - struct nft_data_desc desc; struct nft_set_elem elem; struct sk_buff *skb; uint32_t flags = 0; @@ -4242,14 +4661,17 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; - err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, - nla[NFTA_SET_ELEM_KEY]); + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); if (err < 0) return err; - err = -EINVAL; - if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) - return err; + if (nla[NFTA_SET_ELEM_KEY_END]) { + err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, + nla[NFTA_SET_ELEM_KEY_END]); + if (err < 0) + return err; + } priv = set->ops->get(ctx->net, set, &elem, flags); if (IS_ERR(priv)) @@ -4376,8 +4798,8 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, - const u32 *key, const u32 *data, - u64 timeout, u64 expiration, gfp_t gfp) + const u32 *key, const u32 *key_end, + const u32 *data, u64 timeout, u64 expiration, gfp_t gfp) { struct nft_set_ext *ext; void *elem; @@ -4390,6 +4812,8 @@ void *nft_set_elem_init(const struct nft_set *set, nft_set_ext_init(ext, tmpl); memcpy(nft_set_ext_key(ext), key, set->klen); + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) + memcpy(nft_set_ext_key_end(ext), key_end, set->klen); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) memcpy(nft_set_ext_data(ext), data, set->dlen); if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { @@ -4449,13 +4873,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; u8 genmask = nft_genmask_next(ctx->net); - struct nft_data_desc d1, d2; struct nft_set_ext_tmpl tmpl; struct nft_set_ext *ext, *ext2; struct nft_set_elem elem; struct nft_set_binding *binding; struct nft_object *obj = NULL; struct nft_userdata *udata; + struct nft_data_desc desc; struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; @@ -4485,14 +4909,20 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_DATA] == NULL && !(flags & NFT_SET_ELEM_INTERVAL_END)) return -EINVAL; - if (nla[NFTA_SET_ELEM_DATA] != NULL && - flags & NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; } else { if (nla[NFTA_SET_ELEM_DATA] != NULL) return -EINVAL; } + if ((flags & NFT_SET_ELEM_INTERVAL_END) && + (nla[NFTA_SET_ELEM_DATA] || + nla[NFTA_SET_ELEM_OBJREF] || + nla[NFTA_SET_ELEM_TIMEOUT] || + nla[NFTA_SET_ELEM_EXPIRATION] || + nla[NFTA_SET_ELEM_USERDATA] || + nla[NFTA_SET_ELEM_EXPR])) + return -EINVAL; + timeout = 0; if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) @@ -4515,15 +4945,22 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; } - err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1, - nla[NFTA_SET_ELEM_KEY]); + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); if (err < 0) - goto err1; - err = -EINVAL; - if (d1.type != NFT_DATA_VALUE || d1.len != set->klen) - goto err2; + return err; + + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + + if (nla[NFTA_SET_ELEM_KEY_END]) { + err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, + nla[NFTA_SET_ELEM_KEY_END]); + if (err < 0) + goto err_parse_key; + + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); + } - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, d1.len); if (timeout > 0) { nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION); if (timeout != set->timeout) @@ -4533,27 +4970,27 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_OBJREF] != NULL) { if (!(set->flags & NFT_SET_OBJECT)) { err = -EINVAL; - goto err2; + goto err_parse_key_end; } obj = nft_obj_lookup(ctx->net, ctx->table, nla[NFTA_SET_ELEM_OBJREF], set->objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); - goto err2; + goto err_parse_key_end; } nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF); } if (nla[NFTA_SET_ELEM_DATA] != NULL) { - err = nft_data_init(ctx, &data, sizeof(data), &d2, + err = nft_data_init(ctx, &data, sizeof(data), &desc, nla[NFTA_SET_ELEM_DATA]); if (err < 0) - goto err2; + goto err_parse_key_end; err = -EINVAL; - if (set->dtype != NFT_DATA_VERDICT && d2.len != set->dlen) - goto err3; + if (set->dtype != NFT_DATA_VERDICT && desc.len != set->dlen) + goto err_parse_data; dreg = nft_type_to_reg(set->dtype); list_for_each_entry(binding, &set->bindings, list) { @@ -4569,18 +5006,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = nft_validate_register_store(&bind_ctx, dreg, &data, - d2.type, d2.len); + desc.type, desc.len); if (err < 0) - goto err3; + goto err_parse_data; - if (d2.type == NFT_DATA_VERDICT && + if (desc.type == NFT_DATA_VERDICT && (data.verdict.code == NFT_GOTO || data.verdict.code == NFT_JUMP)) nft_validate_state_update(ctx->net, NFT_VALIDATE_NEED); } - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len); } /* The full maximum length of userdata can exceed the maximum @@ -4596,10 +5033,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } err = -ENOMEM; - elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data, + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, + elem.key_end.val.data, data.data, timeout, expiration, GFP_KERNEL); if (elem.priv == NULL) - goto err3; + goto err_parse_data; ext = nft_set_elem_ext(set, elem.priv); if (flags) @@ -4616,7 +5054,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) - goto err4; + goto err_trans; ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; err = set->ops->insert(ctx->net, set, &elem, &ext2); @@ -4627,7 +5065,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^ nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) { err = -EBUSY; - goto err5; + goto err_element_clash; } if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && @@ -4640,33 +5078,35 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, else if (!(nlmsg_flags & NLM_F_EXCL)) err = 0; } - goto err5; + goto err_element_clash; } if (set->size && !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) { err = -ENFILE; - goto err6; + goto err_set_full; } nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; -err6: +err_set_full: set->ops->remove(ctx->net, set, &elem); -err5: +err_element_clash: kfree(trans); -err4: +err_trans: if (obj) obj->use--; kfree(elem.priv); -err3: +err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) - nft_data_release(&data, d2.type); -err2: - nft_data_release(&elem.key.val, d1.type); -err1: + nft_data_release(&data, desc.type); +err_parse_key_end: + nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); +err_parse_key: + nft_data_release(&elem.key.val, NFT_DATA_VALUE); + return err; } @@ -4761,7 +5201,6 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_set_ext_tmpl tmpl; - struct nft_data_desc desc; struct nft_set_elem elem; struct nft_set_ext *ext; struct nft_trans *trans; @@ -4772,11 +5211,10 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy, NULL); if (err < 0) - goto err1; + return err; - err = -EINVAL; if (nla[NFTA_SET_ELEM_KEY] == NULL) - goto err1; + return -EINVAL; nft_set_ext_prepare(&tmpl); @@ -4786,37 +5224,41 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (flags != 0) nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); - err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, - nla[NFTA_SET_ELEM_KEY]); + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); if (err < 0) - goto err1; + return err; - err = -EINVAL; - if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) - goto err2; + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, desc.len); + if (nla[NFTA_SET_ELEM_KEY_END]) { + err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, + nla[NFTA_SET_ELEM_KEY_END]); + if (err < 0) + return err; + + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); + } err = -ENOMEM; - elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0, - 0, GFP_KERNEL); + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, + elem.key_end.val.data, NULL, 0, 0, + GFP_KERNEL); if (elem.priv == NULL) - goto err2; + goto fail_elem; ext = nft_set_elem_ext(set, elem.priv); if (flags) *nft_set_ext_flags(ext) = flags; trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); - if (trans == NULL) { - err = -ENOMEM; - goto err3; - } + if (trans == NULL) + goto fail_trans; priv = set->ops->deactivate(ctx->net, set, &elem); if (priv == NULL) { err = -ENOENT; - goto err4; + goto fail_ops; } kfree(elem.priv); elem.priv = priv; @@ -4827,13 +5269,12 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; -err4: +fail_ops: kfree(trans); -err3: +fail_trans: kfree(elem.priv); -err2: - nft_data_release(&elem.key.val, desc.type); -err1: +fail_elem: + nft_data_release(&elem.key.val, NFT_DATA_VALUE); return err; } @@ -5123,14 +5564,45 @@ nft_obj_type_get(struct net *net, u32 objtype) lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { - nft_request_module(net, "nft-obj-%u", objtype); - if (__nft_obj_type_get(objtype)) + if (nft_request_module(net, "nft-obj-%u", objtype) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif return ERR_PTR(-ENOENT); } +static int nf_tables_updobj(const struct nft_ctx *ctx, + const struct nft_object_type *type, + const struct nlattr *attr, + struct nft_object *obj) +{ + struct nft_object *newobj; + struct nft_trans *trans; + int err; + + trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ, + sizeof(struct nft_trans_obj)); + if (!trans) + return -ENOMEM; + + newobj = nft_obj_init(ctx, type, attr); + if (IS_ERR(newobj)) { + err = PTR_ERR(newobj); + goto err_free_trans; + } + + nft_trans_obj(trans) = obj; + nft_trans_obj_update(trans) = true; + nft_trans_obj_newobj(trans) = newobj; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; + +err_free_trans: + kfree(trans); + return err; +} + static int nf_tables_newobj(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -5170,7 +5642,13 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return -EEXIST; } - return 0; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; + + type = __nft_obj_type_get(objtype); + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + + return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj); } nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); @@ -5538,6 +6016,7 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { .len = NFT_NAME_MAXLEN - 1 }, [NFTA_FLOWTABLE_HOOK] = { .type = NLA_NESTED }, [NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 }, + [NFTA_FLOWTABLE_FLAGS] = { .type = NLA_U32 }, }; struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, @@ -5554,6 +6033,22 @@ struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, } EXPORT_SYMBOL_GPL(nft_flowtable_lookup); +void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, + struct nft_flowtable *flowtable, + enum nft_trans_phase phase) +{ + switch (phase) { + case NFT_TRANS_PREPARE: + case NFT_TRANS_ABORT: + case NFT_TRANS_RELEASE: + flowtable->use--; + /* fall through */ + default: + return; + } +} +EXPORT_SYMBOL_GPL(nf_tables_deactivate_flowtable); + static struct nft_flowtable * nft_flowtable_lookup_byhandle(const struct nft_table *table, const struct nlattr *nla, u8 genmask) @@ -5568,43 +6063,6 @@ nft_flowtable_lookup_byhandle(const struct nft_table *table, return ERR_PTR(-ENOENT); } -static int nf_tables_parse_devices(const struct nft_ctx *ctx, - const struct nlattr *attr, - struct net_device *dev_array[], int *len) -{ - const struct nlattr *tmp; - struct net_device *dev; - char ifname[IFNAMSIZ]; - int rem, n = 0, err; - - nla_for_each_nested(tmp, attr, rem) { - if (nla_type(tmp) != NFTA_DEVICE_NAME) { - err = -EINVAL; - goto err1; - } - - nla_strlcpy(ifname, tmp, IFNAMSIZ); - dev = __dev_get_by_name(ctx->net, ifname); - if (!dev) { - err = -ENOENT; - goto err1; - } - - dev_array[n++] = dev; - if (n == NFT_FLOWTABLE_DEVICE_MAX) { - err = -EFBIG; - goto err1; - } - } - if (!len) - return -EINVAL; - - err = 0; -err1: - *len = n; - return err; -} - static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = { [NFTA_FLOWTABLE_HOOK_NUM] = { .type = NLA_U32 }, [NFTA_FLOWTABLE_HOOK_PRIORITY] = { .type = NLA_U32 }, @@ -5615,11 +6073,10 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, const struct nlattr *attr, struct nft_flowtable *flowtable) { - struct net_device *dev_array[NFT_FLOWTABLE_DEVICE_MAX]; struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; - struct nf_hook_ops *ops; + struct nft_hook *hook; int hooknum, priority; - int err, n = 0, i; + int err; err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, attr, nft_flowtable_hook_policy, NULL); @@ -5637,27 +6094,21 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); - err = nf_tables_parse_devices(ctx, tb[NFTA_FLOWTABLE_HOOK_DEVS], - dev_array, &n); + err = nf_tables_parse_netdev_hooks(ctx->net, + tb[NFTA_FLOWTABLE_HOOK_DEVS], + &flowtable->hook_list); if (err < 0) return err; - ops = kcalloc(n, sizeof(struct nf_hook_ops), GFP_KERNEL); - if (!ops) - return -ENOMEM; - - flowtable->hooknum = hooknum; - flowtable->priority = priority; - flowtable->ops = ops; - flowtable->ops_len = n; + flowtable->hooknum = hooknum; + flowtable->data.priority = priority; - for (i = 0; i < n; i++) { - flowtable->ops[i].pf = NFPROTO_NETDEV; - flowtable->ops[i].hooknum = hooknum; - flowtable->ops[i].priority = priority; - flowtable->ops[i].priv = &flowtable->data; - flowtable->ops[i].hook = flowtable->data.type->hook; - flowtable->ops[i].dev = dev_array[i]; + list_for_each_entry(hook, &flowtable->hook_list, list) { + hook->ops.pf = NFPROTO_NETDEV; + hook->ops.hooknum = hooknum; + hook->ops.priority = priority; + hook->ops.priv = &flowtable->data; + hook->ops.hook = flowtable->data.type->hook; } return err; @@ -5686,25 +6137,81 @@ nft_flowtable_type_get(struct net *net, u8 family) lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { - nft_request_module(net, "nf-flowtable-%u", family); - if (__nft_flowtable_type_get(family)) + if (nft_request_module(net, "nf-flowtable-%u", family) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif return ERR_PTR(-ENOENT); } +/* Only called from error and netdev event paths. */ +static void nft_unregister_flowtable_hook(struct net *net, + struct nft_flowtable *flowtable, + struct nft_hook *hook) +{ + nf_unregister_net_hook(net, &hook->ops); + flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + FLOW_BLOCK_UNBIND); +} + static void nft_unregister_flowtable_net_hooks(struct net *net, struct nft_flowtable *flowtable) { - int i; + struct nft_hook *hook; - for (i = 0; i < flowtable->ops_len; i++) { - if (!flowtable->ops[i].dev) - continue; + list_for_each_entry(hook, &flowtable->hook_list, list) + nf_unregister_net_hook(net, &hook->ops); +} - nf_unregister_net_hook(net, &flowtable->ops[i]); +static int nft_register_flowtable_net_hooks(struct net *net, + struct nft_table *table, + struct nft_flowtable *flowtable) +{ + struct nft_hook *hook, *hook2, *next; + struct nft_flowtable *ft; + int err, i = 0; + + list_for_each_entry(hook, &flowtable->hook_list, list) { + list_for_each_entry(ft, &table->flowtables, list) { + list_for_each_entry(hook2, &ft->hook_list, list) { + if (hook->ops.dev == hook2->ops.dev && + hook->ops.pf == hook2->ops.pf) { + err = -EBUSY; + goto err_unregister_net_hooks; + } + } + } + + err = flowtable->data.type->setup(&flowtable->data, + hook->ops.dev, + FLOW_BLOCK_BIND); + if (err < 0) + goto err_unregister_net_hooks; + + err = nf_register_net_hook(net, &hook->ops); + if (err < 0) { + flowtable->data.type->setup(&flowtable->data, + hook->ops.dev, + FLOW_BLOCK_UNBIND); + goto err_unregister_net_hooks; + } + + i++; } + + return 0; + +err_unregister_net_hooks: + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + if (i-- <= 0) + break; + + nft_unregister_flowtable_hook(net, flowtable, hook); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } + + return err; } static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, @@ -5715,12 +6222,13 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nf_flowtable_type *type; - struct nft_flowtable *flowtable, *ft; u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; + struct nft_flowtable *flowtable; + struct nft_hook *hook, *next; struct nft_table *table; struct nft_ctx ctx; - int err, i, k; + int err; if (!nla[NFTA_FLOWTABLE_TABLE] || !nla[NFTA_FLOWTABLE_NAME] || @@ -5759,6 +6267,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, flowtable->table = table; flowtable->handle = nf_tables_alloc_handle(table); + INIT_LIST_HEAD(&flowtable->hook_list); flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL); if (!flowtable->name) { @@ -5772,6 +6281,14 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, goto err2; } + if (nla[NFTA_FLOWTABLE_FLAGS]) { + flowtable->data.flags = + ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS])); + if (flowtable->data.flags & ~NF_FLOWTABLE_HW_OFFLOAD) + goto err3; + } + + write_pnet(&flowtable->data.net, net); flowtable->data.type = type; err = type->init(&flowtable->data); if (err < 0) @@ -5782,43 +6299,24 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (err < 0) goto err4; - for (i = 0; i < flowtable->ops_len; i++) { - if (!flowtable->ops[i].dev) - continue; - - list_for_each_entry(ft, &table->flowtables, list) { - for (k = 0; k < ft->ops_len; k++) { - if (!ft->ops[k].dev) - continue; - - if (flowtable->ops[i].dev == ft->ops[k].dev && - flowtable->ops[i].pf == ft->ops[k].pf) { - err = -EBUSY; - goto err5; - } - } - } - - err = nf_register_net_hook(net, &flowtable->ops[i]); - if (err < 0) - goto err5; - } + err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable); + if (err < 0) + goto err4; err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); if (err < 0) - goto err6; + goto err5; list_add_tail_rcu(&flowtable->list, &table->flowtables); table->use++; return 0; -err6: - i = flowtable->ops_len; err5: - for (k = i - 1; k >= 0; k--) - nf_unregister_net_hook(net, &flowtable->ops[k]); - - kfree(flowtable->ops); + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + nft_unregister_flowtable_hook(net, flowtable, hook); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } err4: flowtable->data.type->free(&flowtable->data); err3: @@ -5885,8 +6383,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, { struct nlattr *nest, *nest_devs; struct nfgenmsg *nfmsg; + struct nft_hook *hook; struct nlmsghdr *nlh; - int i; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); @@ -5902,25 +6400,23 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) || nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) || nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle), - NFTA_FLOWTABLE_PAD)) + NFTA_FLOWTABLE_PAD) || + nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags))) goto nla_put_failure; nest = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK); if (!nest) goto nla_put_failure; if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) || - nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->priority))) + nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->data.priority))) goto nla_put_failure; nest_devs = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK_DEVS); if (!nest_devs) goto nla_put_failure; - for (i = 0; i < flowtable->ops_len; i++) { - const struct net_device *dev = READ_ONCE(flowtable->ops[i].dev); - - if (dev && - nla_put_string(skb, NFTA_DEVICE_NAME, dev->name)) + list_for_each_entry_rcu(hook, &flowtable->hook_list, list) { + if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name)) goto nla_put_failure; } nla_nest_end(skb, nest_devs); @@ -6111,9 +6607,16 @@ err: static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) { - kfree(flowtable->ops); - kfree(flowtable->name); + struct nft_hook *hook, *next; + flowtable->data.type->free(&flowtable->data); + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + FLOW_BLOCK_UNBIND); + list_del_rcu(&hook->list); + kfree(hook); + } + kfree(flowtable->name); module_put(flowtable->data.type->owner); kfree(flowtable); } @@ -6151,14 +6654,16 @@ nla_put_failure: static void nft_flowtable_event(unsigned long event, struct net_device *dev, struct nft_flowtable *flowtable) { - int i; + struct nft_hook *hook; - for (i = 0; i < flowtable->ops_len; i++) { - if (flowtable->ops[i].dev != dev) + list_for_each_entry(hook, &flowtable->hook_list, list) { + if (hook->ops.dev != dev) continue; - nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]); - flowtable->ops[i].dev = NULL; + /* flow_offload_netdev_event() cleans up entries for us. */ + nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); break; } } @@ -6431,6 +6936,20 @@ static void nft_chain_commit_update(struct nft_trans *trans) } } +static void nft_obj_commit_update(struct nft_trans *trans) +{ + struct nft_object *newobj; + struct nft_object *obj; + + obj = nft_trans_obj(trans); + newobj = nft_trans_obj_newobj(trans); + + if (obj->ops->update) + obj->ops->update(obj, newobj); + + kfree(newobj); +} + static void nft_commit_release(struct nft_trans *trans) { switch (trans->msg_type) { @@ -6620,6 +7139,18 @@ static void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nf_tables_module_autoload_cleanup(struct net *net) +{ + struct nft_module_request *req, *next; + + WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); + list_for_each_entry_safe(req, next, &net->nft.module_list, list) { + WARN_ON_ONCE(!req->done); + list_del(&req->list); + kfree(req); + } +} + static void nf_tables_commit_release(struct net *net) { struct nft_trans *trans; @@ -6632,6 +7163,7 @@ static void nf_tables_commit_release(struct net *net) * to prevent expensive synchronize_rcu() in commit phase. */ if (list_empty(&net->nft.commit_list)) { + nf_tables_module_autoload_cleanup(net); mutex_unlock(&net->nft.commit_mutex); return; } @@ -6646,6 +7178,7 @@ static void nf_tables_commit_release(struct net *net) list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list); spin_unlock(&nf_tables_destroy_list_lock); + nf_tables_module_autoload_cleanup(net); mutex_unlock(&net->nft.commit_mutex); schedule_work(&trans_destroy_work); @@ -6795,10 +7328,18 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) te->set->ndeact--; break; case NFT_MSG_NEWOBJ: - nft_clear(net, nft_trans_obj(trans)); - nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans), - NFT_MSG_NEWOBJ); - nft_trans_destroy(trans); + if (nft_trans_obj_update(trans)) { + nft_obj_commit_update(trans); + nf_tables_obj_notify(&trans->ctx, + nft_trans_obj(trans), + NFT_MSG_NEWOBJ); + } else { + nft_clear(net, nft_trans_obj(trans)); + nf_tables_obj_notify(&trans->ctx, + nft_trans_obj(trans), + NFT_MSG_NEWOBJ); + nft_trans_destroy(trans); + } break; case NFT_MSG_DELOBJ: nft_obj_del(nft_trans_obj(trans)); @@ -6829,6 +7370,26 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return 0; } +static void nf_tables_module_autoload(struct net *net) +{ + struct nft_module_request *req, *next; + LIST_HEAD(module_list); + + list_splice_init(&net->nft.module_list, &module_list); + mutex_unlock(&net->nft.commit_mutex); + list_for_each_entry_safe(req, next, &module_list, list) { + if (req->done) { + list_del(&req->list); + kfree(req); + } else { + request_module("%s", req->module); + req->done = true; + } + } + mutex_lock(&net->nft.commit_mutex); + list_splice(&module_list, &net->nft.module_list); +} + static void nf_tables_abort_release(struct nft_trans *trans) { switch (trans->msg_type) { @@ -6858,7 +7419,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) kfree(trans); } -static int __nf_tables_abort(struct net *net) +static int __nf_tables_abort(struct net *net, bool autoload) { struct nft_trans *trans, *next; struct nft_trans_elem *te; @@ -6945,8 +7506,13 @@ static int __nf_tables_abort(struct net *net) nft_trans_destroy(trans); break; case NFT_MSG_NEWOBJ: - trans->ctx.table->use--; - nft_obj_del(nft_trans_obj(trans)); + if (nft_trans_obj_update(trans)) { + kfree(nft_trans_obj_newobj(trans)); + nft_trans_destroy(trans); + } else { + trans->ctx.table->use--; + nft_obj_del(nft_trans_obj(trans)); + } break; case NFT_MSG_DELOBJ: trans->ctx.table->use++; @@ -6975,6 +7541,11 @@ static int __nf_tables_abort(struct net *net) nf_tables_abort_release(trans); } + if (autoload) + nf_tables_module_autoload(net); + else + nf_tables_module_autoload_cleanup(net); + return 0; } @@ -6983,9 +7554,9 @@ static void nf_tables_cleanup(struct net *net) nft_validate_state_update(net, NFT_VALIDATE_SKIP); } -static int nf_tables_abort(struct net *net, struct sk_buff *skb) +static int nf_tables_abort(struct net *net, struct sk_buff *skb, bool autoload) { - int ret = __nf_tables_abort(net); + int ret = __nf_tables_abort(net, autoload); mutex_unlock(&net->nft.commit_mutex); @@ -7235,7 +7806,7 @@ int nft_validate_register_load(enum nft_registers reg, unsigned int len) return -EINVAL; if (len == 0) return -EINVAL; - if (reg * NFT_REG32_SIZE + len > FIELD_SIZEOF(struct nft_regs, data)) + if (reg * NFT_REG32_SIZE + len > sizeof_field(struct nft_regs, data)) return -ERANGE; return 0; @@ -7283,7 +7854,7 @@ int nft_validate_register_store(const struct nft_ctx *ctx, if (len == 0) return -EINVAL; if (reg * NFT_REG32_SIZE + len > - FIELD_SIZEOF(struct nft_regs, data)) + sizeof_field(struct nft_regs, data)) return -ERANGE; if (data != NULL && type != NFT_DATA_VALUE) @@ -7580,6 +8151,7 @@ static int __net_init nf_tables_init_net(struct net *net) { INIT_LIST_HEAD(&net->nft.tables); INIT_LIST_HEAD(&net->nft.commit_list); + INIT_LIST_HEAD(&net->nft.module_list); mutex_init(&net->nft.commit_mutex); net->nft.base_seq = 1; net->nft.validate_state = NFT_VALIDATE_SKIP; @@ -7591,7 +8163,7 @@ static void __net_exit nf_tables_exit_net(struct net *net) { mutex_lock(&net->nft.commit_mutex); if (!list_empty(&net->nft.commit_list)) - __nf_tables_abort(net); + __nf_tables_abort(net, false); __nft_release_tables(net); mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); @@ -7627,13 +8199,20 @@ static int __init nf_tables_module_init(void) if (err < 0) goto err4; + err = nft_offload_init(); + if (err < 0) + goto err5; + /* must be last */ err = nfnetlink_subsys_register(&nf_tables_subsys); if (err < 0) - goto err5; + goto err6; nft_chain_route_init(); + return err; +err6: + nft_offload_exit(); err5: rhltable_destroy(&nft_objname_ht); err4: @@ -7650,6 +8229,7 @@ err1: static void __exit nf_tables_module_exit(void) { nfnetlink_subsys_unregister(&nf_tables_subsys); + nft_offload_exit(); unregister_netdevice_notifier(&nf_tables_flowtable_notifier); nft_chain_filter_fini(); nft_chain_route_fini(); diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index c0d18c1d77ac..2bb28483af22 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -28,13 +28,10 @@ static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) return flow; } -struct nft_flow_rule *nft_flow_rule_create(const struct nft_rule *rule) +struct nft_flow_rule *nft_flow_rule_create(struct net *net, + const struct nft_rule *rule) { - struct nft_offload_ctx ctx = { - .dep = { - .type = NFT_OFFLOAD_DEP_UNSPEC, - }, - }; + struct nft_offload_ctx *ctx; struct nft_flow_rule *flow; int num_actions = 0, err; struct nft_expr *expr; @@ -47,26 +44,40 @@ struct nft_flow_rule *nft_flow_rule_create(const struct nft_rule *rule) expr = nft_expr_next(expr); } + if (num_actions == 0) + return ERR_PTR(-EOPNOTSUPP); + flow = nft_flow_rule_alloc(num_actions); if (!flow) return ERR_PTR(-ENOMEM); expr = nft_expr_first(rule); + + ctx = kzalloc(sizeof(struct nft_offload_ctx), GFP_KERNEL); + if (!ctx) { + err = -ENOMEM; + goto err_out; + } + ctx->net = net; + ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; + while (expr->ops && expr != nft_expr_last(rule)) { if (!expr->ops->offload) { err = -EOPNOTSUPP; goto err_out; } - err = expr->ops->offload(&ctx, flow, expr); + err = expr->ops->offload(ctx, flow, expr); if (err < 0) goto err_out; expr = nft_expr_next(expr); } - flow->proto = ctx.dep.l3num; + flow->proto = ctx->dep.l3num; + kfree(ctx); return flow; err_out: + kfree(ctx); nft_flow_rule_destroy(flow); return ERR_PTR(err); @@ -74,6 +85,19 @@ err_out: void nft_flow_rule_destroy(struct nft_flow_rule *flow) { + struct flow_action_entry *entry; + int i; + + flow_action_for_each(i, entry, &flow->rule->action) { + switch (entry->id) { + case FLOW_ACTION_REDIRECT: + case FLOW_ACTION_MIRRED: + dev_put(entry->dev); + break; + default: + break; + } + } kfree(flow->rule); kfree(flow); } @@ -111,13 +135,13 @@ static void nft_flow_offload_common_init(struct flow_cls_common_offload *common, common->extack = extack; } -static int nft_setup_cb_call(struct nft_base_chain *basechain, - enum tc_setup_type type, void *type_data) +static int nft_setup_cb_call(enum tc_setup_type type, void *type_data, + struct list_head *cb_list) { struct flow_block_cb *block_cb; int err; - list_for_each_entry(block_cb, &basechain->flow_block.cb_list, list) { + list_for_each_entry(block_cb, cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err < 0) return err; @@ -134,32 +158,46 @@ int nft_chain_offload_priority(struct nft_base_chain *basechain) return 0; } -static int nft_flow_offload_rule(struct nft_trans *trans, - enum flow_cls_command command) +static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, + const struct nft_base_chain *basechain, + const struct nft_rule *rule, + const struct nft_flow_rule *flow, + struct netlink_ext_ack *extack, + enum flow_cls_command command) { - struct nft_flow_rule *flow = nft_trans_flow_rule(trans); - struct nft_rule *rule = nft_trans_rule(trans); - struct flow_cls_offload cls_flow = {}; - struct nft_base_chain *basechain; - struct netlink_ext_ack extack; __be16 proto = ETH_P_ALL; - if (!nft_is_base_chain(trans->ctx.chain)) - return -EOPNOTSUPP; - - basechain = nft_base_chain(trans->ctx.chain); + memset(cls_flow, 0, sizeof(*cls_flow)); if (flow) proto = flow->proto; - nft_flow_offload_common_init(&cls_flow.common, proto, - basechain->ops.priority, &extack); - cls_flow.command = command; - cls_flow.cookie = (unsigned long) rule; + nft_flow_offload_common_init(&cls_flow->common, proto, + basechain->ops.priority, extack); + cls_flow->command = command; + cls_flow->cookie = (unsigned long) rule; if (flow) - cls_flow.rule = flow->rule; + cls_flow->rule = flow->rule; +} + +static int nft_flow_offload_rule(struct nft_chain *chain, + struct nft_rule *rule, + struct nft_flow_rule *flow, + enum flow_cls_command command) +{ + struct netlink_ext_ack extack = {}; + struct flow_cls_offload cls_flow; + struct nft_base_chain *basechain; - return nft_setup_cb_call(basechain, TC_SETUP_CLSFLOWER, &cls_flow); + if (!nft_is_base_chain(chain)) + return -EOPNOTSUPP; + + basechain = nft_base_chain(chain); + nft_flow_cls_offload_setup(&cls_flow, basechain, rule, flow, &extack, + command); + + return nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, + &basechain->flow_block.cb_list); } static int nft_flow_offload_bind(struct flow_block_offload *bo, @@ -173,6 +211,18 @@ static int nft_flow_offload_unbind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { struct flow_block_cb *block_cb, *next; + struct flow_cls_offload cls_flow; + struct netlink_ext_ack extack; + struct nft_chain *chain; + struct nft_rule *rule; + + chain = &basechain->chain; + list_for_each_entry(rule, &chain->rules, list) { + memset(&extack, 0, sizeof(extack)); + nft_flow_cls_offload_setup(&cls_flow, basechain, rule, NULL, + &extack, FLOW_CLS_DESTROY); + nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &bo->cb_list); + } list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { list_del(&block_cb->list); @@ -182,58 +232,220 @@ static int nft_flow_offload_unbind(struct flow_block_offload *bo, return 0; } +static int nft_block_setup(struct nft_base_chain *basechain, + struct flow_block_offload *bo, + enum flow_block_command cmd) +{ + int err; + + switch (cmd) { + case FLOW_BLOCK_BIND: + err = nft_flow_offload_bind(bo, basechain); + break; + case FLOW_BLOCK_UNBIND: + err = nft_flow_offload_unbind(bo, basechain); + break; + default: + WARN_ON_ONCE(1); + err = -EOPNOTSUPP; + } + + return err; +} + +static void nft_flow_block_offload_init(struct flow_block_offload *bo, + struct net *net, + enum flow_block_command cmd, + struct nft_base_chain *basechain, + struct netlink_ext_ack *extack) +{ + memset(bo, 0, sizeof(*bo)); + bo->net = net; + bo->block = &basechain->flow_block; + bo->command = cmd; + bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + bo->extack = extack; + INIT_LIST_HEAD(&bo->cb_list); +} + +static int nft_block_offload_cmd(struct nft_base_chain *chain, + struct net_device *dev, + enum flow_block_command cmd) +{ + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; + int err; + + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); + if (err < 0) + return err; + + return nft_block_setup(chain, &bo, cmd); +} + +static void nft_indr_block_ing_cmd(struct net_device *dev, + struct nft_base_chain *chain, + flow_indr_block_bind_cb_t *cb, + void *cb_priv, + enum flow_block_command cmd) +{ + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; + + if (!chain) + return; + + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); + + cb(dev, cb_priv, TC_SETUP_BLOCK, &bo); + + nft_block_setup(chain, &bo, cmd); +} + +static int nft_indr_block_offload_cmd(struct nft_base_chain *chain, + struct net_device *dev, + enum flow_block_command cmd) +{ + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; + + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); + + flow_indr_block_call(dev, &bo, cmd); + + if (list_empty(&bo.cb_list)) + return -EOPNOTSUPP; + + return nft_block_setup(chain, &bo, cmd); +} + #define FLOW_SETUP_BLOCK TC_SETUP_BLOCK -static int nft_flow_offload_chain(struct nft_trans *trans, +static int nft_chain_offload_cmd(struct nft_base_chain *basechain, + struct net_device *dev, + enum flow_block_command cmd) +{ + int err; + + if (dev->netdev_ops->ndo_setup_tc) + err = nft_block_offload_cmd(basechain, dev, cmd); + else + err = nft_indr_block_offload_cmd(basechain, dev, cmd); + + return err; +} + +static int nft_flow_block_chain(struct nft_base_chain *basechain, + const struct net_device *this_dev, + enum flow_block_command cmd) +{ + struct net_device *dev; + struct nft_hook *hook; + int err, i = 0; + + list_for_each_entry(hook, &basechain->hook_list, list) { + dev = hook->ops.dev; + if (this_dev && this_dev != dev) + continue; + + err = nft_chain_offload_cmd(basechain, dev, cmd); + if (err < 0 && cmd == FLOW_BLOCK_BIND) { + if (!this_dev) + goto err_flow_block; + + return err; + } + i++; + } + + return 0; + +err_flow_block: + list_for_each_entry(hook, &basechain->hook_list, list) { + if (i-- <= 0) + break; + + dev = hook->ops.dev; + nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND); + } + return err; +} + +static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, enum flow_block_command cmd) { - struct nft_chain *chain = trans->ctx.chain; - struct netlink_ext_ack extack = {}; - struct flow_block_offload bo = {}; struct nft_base_chain *basechain; - struct net_device *dev; - int err; + u8 policy; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); - dev = basechain->ops.dev; - if (!dev || !dev->netdev_ops->ndo_setup_tc) - return -EOPNOTSUPP; + policy = ppolicy ? *ppolicy : basechain->policy; /* Only default policy to accept is supported for now. */ - if (cmd == FLOW_BLOCK_BIND && - nft_trans_chain_policy(trans) != -1 && - nft_trans_chain_policy(trans) != NF_ACCEPT) + if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP) return -EOPNOTSUPP; - bo.command = cmd; - bo.block = &basechain->flow_block; - bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; - bo.extack = &extack; - INIT_LIST_HEAD(&bo.cb_list); + return nft_flow_block_chain(basechain, NULL, cmd); +} - err = dev->netdev_ops->ndo_setup_tc(dev, FLOW_SETUP_BLOCK, &bo); - if (err < 0) - return err; +static void nft_flow_rule_offload_abort(struct net *net, + struct nft_trans *trans) +{ + int err = 0; - switch (cmd) { - case FLOW_BLOCK_BIND: - err = nft_flow_offload_bind(&bo, basechain); - break; - case FLOW_BLOCK_UNBIND: - err = nft_flow_offload_unbind(&bo, basechain); - break; - } + list_for_each_entry_continue_reverse(trans, &net->nft.commit_list, list) { + if (trans->ctx.family != NFPROTO_NETDEV) + continue; - return err; + switch (trans->msg_type) { + case NFT_MSG_NEWCHAIN: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || + nft_trans_chain_update(trans)) + continue; + + err = nft_flow_offload_chain(trans->ctx.chain, NULL, + FLOW_BLOCK_UNBIND); + break; + case NFT_MSG_DELCHAIN: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_chain(trans->ctx.chain, NULL, + FLOW_BLOCK_BIND); + break; + case NFT_MSG_NEWRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_rule(trans->ctx.chain, + nft_trans_rule(trans), + NULL, FLOW_CLS_DESTROY); + break; + case NFT_MSG_DELRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_rule(trans->ctx.chain, + nft_trans_rule(trans), + nft_trans_flow_rule(trans), + FLOW_CLS_REPLACE); + break; + } + + if (WARN_ON_ONCE(err)) + break; + } } int nft_flow_rule_offload_commit(struct net *net) { struct nft_trans *trans; int err = 0; + u8 policy; list_for_each_entry(trans, &net->nft.commit_list, list) { if (trans->ctx.family != NFPROTO_NETDEV) @@ -241,39 +453,171 @@ int nft_flow_rule_offload_commit(struct net *net) switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || + nft_trans_chain_update(trans)) continue; - err = nft_flow_offload_chain(trans, FLOW_BLOCK_BIND); + policy = nft_trans_chain_policy(trans); + err = nft_flow_offload_chain(trans->ctx.chain, &policy, + FLOW_BLOCK_BIND); break; case NFT_MSG_DELCHAIN: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - err = nft_flow_offload_chain(trans, FLOW_BLOCK_UNBIND); + policy = nft_trans_chain_policy(trans); + err = nft_flow_offload_chain(trans->ctx.chain, &policy, + FLOW_BLOCK_UNBIND); break; case NFT_MSG_NEWRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; if (trans->ctx.flags & NLM_F_REPLACE || - !(trans->ctx.flags & NLM_F_APPEND)) - return -EOPNOTSUPP; - - err = nft_flow_offload_rule(trans, FLOW_CLS_REPLACE); - nft_flow_rule_destroy(nft_trans_flow_rule(trans)); + !(trans->ctx.flags & NLM_F_APPEND)) { + err = -EOPNOTSUPP; + break; + } + err = nft_flow_offload_rule(trans->ctx.chain, + nft_trans_rule(trans), + nft_trans_flow_rule(trans), + FLOW_CLS_REPLACE); break; case NFT_MSG_DELRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - err = nft_flow_offload_rule(trans, FLOW_CLS_DESTROY); + err = nft_flow_offload_rule(trans->ctx.chain, + nft_trans_rule(trans), + NULL, FLOW_CLS_DESTROY); break; } - if (err) - return err; + if (err) { + nft_flow_rule_offload_abort(net, trans); + break; + } + } + + list_for_each_entry(trans, &net->nft.commit_list, list) { + if (trans->ctx.family != NFPROTO_NETDEV) + continue; + + switch (trans->msg_type) { + case NFT_MSG_NEWRULE: + case NFT_MSG_DELRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); + break; + default: + break; + } } return err; } + +static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) +{ + struct nft_base_chain *basechain; + struct net *net = dev_net(dev); + struct nft_hook *hook, *found; + const struct nft_table *table; + struct nft_chain *chain; + + list_for_each_entry(table, &net->nft.tables, list) { + if (table->family != NFPROTO_NETDEV) + continue; + + list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_base_chain(chain) || + !(chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + found = NULL; + basechain = nft_base_chain(chain); + list_for_each_entry(hook, &basechain->hook_list, list) { + if (hook->ops.dev != dev) + continue; + + found = hook; + break; + } + if (!found) + continue; + + return chain; + } + } + + return NULL; +} + +static void nft_indr_block_cb(struct net_device *dev, + flow_indr_block_bind_cb_t *cb, void *cb_priv, + enum flow_block_command cmd) +{ + struct net *net = dev_net(dev); + struct nft_chain *chain; + + mutex_lock(&net->nft.commit_mutex); + chain = __nft_offload_get_chain(dev); + if (chain && chain->flags & NFT_CHAIN_HW_OFFLOAD) { + struct nft_base_chain *basechain; + + basechain = nft_base_chain(chain); + nft_indr_block_ing_cmd(dev, basechain, cb, cb_priv, cmd); + } + mutex_unlock(&net->nft.commit_mutex); +} + +static int nft_offload_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + struct nft_chain *chain; + + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + + mutex_lock(&net->nft.commit_mutex); + chain = __nft_offload_get_chain(dev); + if (chain) + nft_flow_block_chain(nft_base_chain(chain), dev, + FLOW_BLOCK_UNBIND); + + mutex_unlock(&net->nft.commit_mutex); + + return NOTIFY_DONE; +} + +static struct flow_indr_block_entry block_ing_entry = { + .cb = nft_indr_block_cb, + .list = LIST_HEAD_INIT(block_ing_entry.list), +}; + +static struct notifier_block nft_offload_netdev_notifier = { + .notifier_call = nft_offload_netdev_event, +}; + +int nft_offload_init(void) +{ + int err; + + err = register_netdevice_notifier(&nft_offload_netdev_notifier); + if (err < 0) + return err; + + flow_indr_add_block_cb(&block_ing_entry); + + return 0; +} + +void nft_offload_exit(void) +{ + flow_indr_del_block_cb(&block_ing_entry); + unregister_netdevice_notifier(&nft_offload_netdev_notifier); +} diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c index a9fce8d10051..586b621007eb 100644 --- a/net/netfilter/nf_tables_set_core.c +++ b/net/netfilter/nf_tables_set_core.c @@ -9,12 +9,14 @@ static int __init nf_tables_set_module_init(void) nft_register_set(&nft_set_rhash_type); nft_register_set(&nft_set_bitmap_type); nft_register_set(&nft_set_rbtree_type); + nft_register_set(&nft_set_pipapo_type); return 0; } static void __exit nf_tables_set_module_exit(void) { + nft_unregister_set(&nft_set_pipapo_type); nft_unregister_set(&nft_set_rbtree_type); nft_unregister_set(&nft_set_bitmap_type); nft_unregister_set(&nft_set_rhash_type); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 4abbb452cf6c..99127e2d95a8 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -476,7 +476,7 @@ ack: } done: if (status & NFNL_BATCH_REPLAY) { - ss->abort(net, oskb); + ss->abort(net, oskb, true); nfnl_err_reset(&err_list); kfree_skb(skb); module_put(ss->owner); @@ -487,11 +487,11 @@ done: status |= NFNL_BATCH_REPLAY; goto done; } else if (err) { - ss->abort(net, oskb); + ss->abort(net, oskb, false); netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); } } else { - ss->abort(net, oskb); + ss->abort(net, oskb, false); } if (ss->cleanup) ss->cleanup(net); diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 7525063c25f5..de3a9596b7f1 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -236,7 +236,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[], nla_strlcpy(helper->name, tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN); size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); - if (size > FIELD_SIZEOF(struct nf_conn_help, data)) { + if (size > sizeof_field(struct nf_conn_help, data)) { ret = -ENOMEM; goto err2; } diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 6dee4f9a944c..0ba020ca38e6 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -385,6 +385,57 @@ nfulnl_timer(struct timer_list *t) instance_put(inst); } +static u32 nfulnl_get_bridge_size(const struct sk_buff *skb) +{ + u32 size = 0; + + if (!skb_mac_header_was_set(skb)) + return 0; + + if (skb_vlan_tag_present(skb)) { + size += nla_total_size(0); /* nested */ + size += nla_total_size(sizeof(u16)); /* id */ + size += nla_total_size(sizeof(u16)); /* tag */ + } + + if (skb->network_header > skb->mac_header) + size += nla_total_size(skb->network_header - skb->mac_header); + + return size; +} + +static int nfulnl_put_bridge(struct nfulnl_instance *inst, const struct sk_buff *skb) +{ + if (!skb_mac_header_was_set(skb)) + return 0; + + if (skb_vlan_tag_present(skb)) { + struct nlattr *nest; + + nest = nla_nest_start(inst->skb, NFULA_VLAN); + if (!nest) + goto nla_put_failure; + + if (nla_put_be16(inst->skb, NFULA_VLAN_TCI, htons(skb->vlan_tci)) || + nla_put_be16(inst->skb, NFULA_VLAN_PROTO, skb->vlan_proto)) + goto nla_put_failure; + + nla_nest_end(inst->skb, nest); + } + + if (skb->mac_header < skb->network_header) { + int len = (int)(skb->network_header - skb->mac_header); + + if (nla_put(inst->skb, NFULA_L2HDR, len, skb_mac_header(skb))) + goto nla_put_failure; + } + + return 0; + +nla_put_failure: + return -1; +} + /* This is an inline function, we don't really care about a long * list of arguments */ static inline int @@ -580,6 +631,10 @@ __build_packet_message(struct nfnl_log_net *log, NFULA_CT, NFULA_CT_INFO) < 0) goto nla_put_failure; + if ((pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) && + nfulnl_put_bridge(inst, skb) < 0) + goto nla_put_failure; + if (data_len) { struct nlattr *nla; int size = nla_attr_size(data_len); @@ -651,7 +706,7 @@ nfulnl_log_packet(struct net *net, /* FIXME: do we want to make the size calculation conditional based on * what is actually present? way more branches and checks, but more * memory efficient... */ - size = nlmsg_total_size(sizeof(struct nfgenmsg)) + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfulnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ @@ -668,7 +723,7 @@ nfulnl_log_packet(struct net *net, + nla_total_size(sizeof(struct nfgenmsg)); /* NLMSG_DONE */ if (in && skb_mac_header_was_set(skb)) { - size += nla_total_size(skb->dev->hard_header_len) + size += nla_total_size(skb->dev->hard_header_len) + nla_total_size(sizeof(u_int16_t)) /* hwtype */ + nla_total_size(sizeof(u_int16_t)); /* hwlen */ } @@ -687,6 +742,8 @@ nfulnl_log_packet(struct net *net, size += nfnl_ct->build_size(ct); } } + if (pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) + size += nfulnl_get_bridge_size(skb); qthreshold = inst->qthreshold; /* per-rule qthreshold overrides per-instance */ diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index b6a7ce622c72..76535fd9278c 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -394,7 +394,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, char *secdata = NULL; u32 seclen = 0; - size = nlmsg_total_size(sizeof(struct nfgenmsg)) + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ @@ -453,7 +453,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } if (queue->flags & NFQA_CFG_F_UID_GID) { - size += (nla_total_size(sizeof(u_int32_t)) /* uid */ + size += (nla_total_size(sizeof(u_int32_t)) /* uid */ + nla_total_size(sizeof(u_int32_t))); /* gid */ } @@ -778,7 +778,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) { unsigned int queued; struct nfqnl_instance *queue; - struct sk_buff *skb, *segs; + struct sk_buff *skb, *segs, *nskb; int err = -ENOBUFS; struct net *net = entry->state.net; struct nfnl_queue_net *q = nfnl_queue_pernet(net); @@ -815,8 +815,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) goto out_err; queued = 0; err = 0; - do { - struct sk_buff *nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { if (err == 0) err = __nfqnl_enqueue_packet_gso(net, queue, segs, entry); @@ -824,8 +823,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) queued++; else kfree_skb(segs); - segs = nskb; - } while (segs); + } if (queued) { if (err) /* some segments are already queued */ diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index b310b637b550..0ed2281f03be 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -13,25 +13,71 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_offload.h> struct nft_bitwise { enum nft_registers sreg:8; enum nft_registers dreg:8; + enum nft_bitwise_ops op:8; u8 len; struct nft_data mask; struct nft_data xor; + struct nft_data data; }; +static void nft_bitwise_eval_bool(u32 *dst, const u32 *src, + const struct nft_bitwise *priv) +{ + unsigned int i; + + for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) + dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i]; +} + +static void nft_bitwise_eval_lshift(u32 *dst, const u32 *src, + const struct nft_bitwise *priv) +{ + u32 shift = priv->data.data[0]; + unsigned int i; + u32 carry = 0; + + for (i = DIV_ROUND_UP(priv->len, sizeof(u32)); i > 0; i--) { + dst[i - 1] = (src[i - 1] << shift) | carry; + carry = src[i - 1] >> (BITS_PER_TYPE(u32) - shift); + } +} + +static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src, + const struct nft_bitwise *priv) +{ + u32 shift = priv->data.data[0]; + unsigned int i; + u32 carry = 0; + + for (i = 0; i < DIV_ROUND_UP(priv->len, sizeof(u32)); i++) { + dst[i] = carry | (src[i] >> shift); + carry = src[i] << (BITS_PER_TYPE(u32) - shift); + } +} + void nft_bitwise_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_bitwise *priv = nft_expr_priv(expr); const u32 *src = ®s->data[priv->sreg]; u32 *dst = ®s->data[priv->dreg]; - unsigned int i; - for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) - dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i]; + switch (priv->op) { + case NFT_BITWISE_BOOL: + nft_bitwise_eval_bool(dst, src, priv); + break; + case NFT_BITWISE_LSHIFT: + nft_bitwise_eval_lshift(dst, src, priv); + break; + case NFT_BITWISE_RSHIFT: + nft_bitwise_eval_rshift(dst, src, priv); + break; + } } static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { @@ -40,22 +86,86 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { [NFTA_BITWISE_LEN] = { .type = NLA_U32 }, [NFTA_BITWISE_MASK] = { .type = NLA_NESTED }, [NFTA_BITWISE_XOR] = { .type = NLA_NESTED }, + [NFTA_BITWISE_OP] = { .type = NLA_U32 }, + [NFTA_BITWISE_DATA] = { .type = NLA_NESTED }, }; +static int nft_bitwise_init_bool(struct nft_bitwise *priv, + const struct nlattr *const tb[]) +{ + struct nft_data_desc d1, d2; + int err; + + if (tb[NFTA_BITWISE_DATA]) + return -EINVAL; + + if (!tb[NFTA_BITWISE_MASK] || + !tb[NFTA_BITWISE_XOR]) + return -EINVAL; + + err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1, + tb[NFTA_BITWISE_MASK]); + if (err < 0) + return err; + if (d1.type != NFT_DATA_VALUE || d1.len != priv->len) { + err = -EINVAL; + goto err1; + } + + err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2, + tb[NFTA_BITWISE_XOR]); + if (err < 0) + goto err1; + if (d2.type != NFT_DATA_VALUE || d2.len != priv->len) { + err = -EINVAL; + goto err2; + } + + return 0; +err2: + nft_data_release(&priv->xor, d2.type); +err1: + nft_data_release(&priv->mask, d1.type); + return err; +} + +static int nft_bitwise_init_shift(struct nft_bitwise *priv, + const struct nlattr *const tb[]) +{ + struct nft_data_desc d; + int err; + + if (tb[NFTA_BITWISE_MASK] || + tb[NFTA_BITWISE_XOR]) + return -EINVAL; + + if (!tb[NFTA_BITWISE_DATA]) + return -EINVAL; + + err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &d, + tb[NFTA_BITWISE_DATA]); + if (err < 0) + return err; + if (d.type != NFT_DATA_VALUE || d.len != sizeof(u32) || + priv->data.data[0] >= BITS_PER_TYPE(u32)) { + nft_data_release(&priv->data, d.type); + return -EINVAL; + } + + return 0; +} + static int nft_bitwise_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_bitwise *priv = nft_expr_priv(expr); - struct nft_data_desc d1, d2; u32 len; int err; - if (tb[NFTA_BITWISE_SREG] == NULL || - tb[NFTA_BITWISE_DREG] == NULL || - tb[NFTA_BITWISE_LEN] == NULL || - tb[NFTA_BITWISE_MASK] == NULL || - tb[NFTA_BITWISE_XOR] == NULL) + if (!tb[NFTA_BITWISE_SREG] || + !tb[NFTA_BITWISE_DREG] || + !tb[NFTA_BITWISE_LEN]) return -EINVAL; err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len); @@ -75,55 +185,102 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, if (err < 0) return err; - err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1, - tb[NFTA_BITWISE_MASK]); - if (err < 0) - return err; - if (d1.len != priv->len) { - err = -EINVAL; - goto err1; + if (tb[NFTA_BITWISE_OP]) { + priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])); + switch (priv->op) { + case NFT_BITWISE_BOOL: + case NFT_BITWISE_LSHIFT: + case NFT_BITWISE_RSHIFT: + break; + default: + return -EOPNOTSUPP; + } + } else { + priv->op = NFT_BITWISE_BOOL; } - err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2, - tb[NFTA_BITWISE_XOR]); - if (err < 0) - goto err1; - if (d2.len != priv->len) { - err = -EINVAL; - goto err2; + switch(priv->op) { + case NFT_BITWISE_BOOL: + err = nft_bitwise_init_bool(priv, tb); + break; + case NFT_BITWISE_LSHIFT: + case NFT_BITWISE_RSHIFT: + err = nft_bitwise_init_shift(priv, tb); + break; } - return 0; -err2: - nft_data_release(&priv->xor, d2.type); -err1: - nft_data_release(&priv->mask, d1.type); return err; } +static int nft_bitwise_dump_bool(struct sk_buff *skb, + const struct nft_bitwise *priv) +{ + if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask, + NFT_DATA_VALUE, priv->len) < 0) + return -1; + + if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor, + NFT_DATA_VALUE, priv->len) < 0) + return -1; + + return 0; +} + +static int nft_bitwise_dump_shift(struct sk_buff *skb, + const struct nft_bitwise *priv) +{ + if (nft_data_dump(skb, NFTA_BITWISE_DATA, &priv->data, + NFT_DATA_VALUE, sizeof(u32)) < 0) + return -1; + return 0; +} + static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_bitwise *priv = nft_expr_priv(expr); + int err = 0; if (nft_dump_register(skb, NFTA_BITWISE_SREG, priv->sreg)) - goto nla_put_failure; + return -1; if (nft_dump_register(skb, NFTA_BITWISE_DREG, priv->dreg)) - goto nla_put_failure; + return -1; if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len))) - goto nla_put_failure; + return -1; + if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(priv->op))) + return -1; - if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask, - NFT_DATA_VALUE, priv->len) < 0) - goto nla_put_failure; + switch (priv->op) { + case NFT_BITWISE_BOOL: + err = nft_bitwise_dump_bool(skb, priv); + break; + case NFT_BITWISE_LSHIFT: + case NFT_BITWISE_RSHIFT: + err = nft_bitwise_dump_shift(skb, priv); + break; + } - if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor, - NFT_DATA_VALUE, priv->len) < 0) - goto nla_put_failure; + return err; +} - return 0; +static struct nft_data zero; -nla_put_failure: - return -1; +static int nft_bitwise_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_bitwise *priv = nft_expr_priv(expr); + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + if (priv->op != NFT_BITWISE_BOOL) + return -EOPNOTSUPP; + + if (memcmp(&priv->xor, &zero, sizeof(priv->xor)) || + priv->sreg != priv->dreg || priv->len != reg->len) + return -EOPNOTSUPP; + + memcpy(®->mask, &priv->mask, sizeof(priv->mask)); + + return 0; } static const struct nft_expr_ops nft_bitwise_ops = { @@ -132,6 +289,7 @@ static const struct nft_expr_ops nft_bitwise_ops = { .eval = nft_bitwise_eval, .init = nft_bitwise_init, .dump = nft_bitwise_dump, + .offload = nft_bitwise_offload, }; struct nft_expr_type nft_bitwise_type __read_mostly = { diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index e06318428ea0..12bed3f7bbc6 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -43,14 +43,15 @@ void nft_byteorder_eval(const struct nft_expr *expr, switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 8; i++) { - src64 = get_unaligned((u64 *)&src[i]); - put_unaligned_be64(src64, &dst[i]); + src64 = nft_reg_load64(&src[i]); + nft_reg_store64(&dst[i], be64_to_cpu(src64)); } break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 8; i++) { - src64 = get_unaligned_be64(&src[i]); - put_unaligned(src64, (u64 *)&dst[i]); + src64 = (__force __u64) + cpu_to_be64(nft_reg_load64(&src[i])); + nft_reg_store64(&dst[i], src64); } break; } diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index b5d5d071d765..c78d01bc02e9 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -287,28 +287,35 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, struct nft_ctx *ctx) { struct nft_base_chain *basechain = nft_base_chain(ctx->chain); + struct nft_hook *hook, *found = NULL; + int n = 0; - switch (event) { - case NETDEV_UNREGISTER: - if (strcmp(basechain->dev_name, dev->name) != 0) - return; - - /* UNREGISTER events are also happpening on netns exit. - * - * Altough nf_tables core releases all tables/chains, only - * this event handler provides guarantee that - * basechain.ops->dev is still accessible, so we cannot - * skip exiting net namespaces. - */ - __nft_release_basechain(ctx); - break; - case NETDEV_CHANGENAME: - if (dev->ifindex != basechain->ops.dev->ifindex) - return; + if (event != NETDEV_UNREGISTER) + return; - strncpy(basechain->dev_name, dev->name, IFNAMSIZ); - break; + list_for_each_entry(hook, &basechain->hook_list, list) { + if (hook->ops.dev == dev) + found = hook; + + n++; } + if (!found) + return; + + if (n > 1) { + nf_unregister_net_hook(ctx->net, &found->ops); + list_del_rcu(&found->list); + kfree_rcu(found, rcu); + return; + } + + /* UNREGISTER events are also happening on netns exit. + * + * Although nf_tables core releases all tables/chains, only this event + * handler provides guarantee that hook->ops.dev is still accessible, + * so we cannot skip exiting net namespaces. + */ + __nft_release_basechain(ctx); } static int nf_tables_netdev_event(struct notifier_block *this, diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index bd173b1824c6..8a28c127effc 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> +#include <linux/if_arp.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables_offload.h> @@ -80,6 +81,12 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (err < 0) return err; + if (desc.type != NFT_DATA_VALUE) { + err = -EINVAL; + nft_data_release(&priv->data, desc.type); + return err; + } + priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); err = nft_validate_register_load(priv->sreg, desc.len); if (err < 0) @@ -116,7 +123,7 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx, u8 *mask = (u8 *)&flow->match.mask; u8 *key = (u8 *)&flow->match.key; - if (priv->op != NFT_CMP_EQ) + if (priv->op != NFT_CMP_EQ || reg->len != priv->len) return -EOPNOTSUPP; memcpy(key + reg->offset, &priv->data, priv->len); @@ -125,6 +132,11 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx, flow->match.dissector.used_keys |= BIT(reg->key); flow->match.dissector.offset[reg->key] = reg->base_offset; + if (reg->key == FLOW_DISSECTOR_KEY_META && + reg->offset == offsetof(struct nft_flow_key, meta.ingress_iftype) && + nft_reg_load16(priv->data.data) != ARPHRD_ETHER) + return -EOPNOTSUPP; + nft_offload_update_dependency(ctx, &priv->data, priv->len); return 0; diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index af1497ab9464..69d6173f91e2 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -218,8 +218,13 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) { struct nft_connlimit *priv = nft_expr_priv(expr); + bool ret; - return nf_conncount_gc_list(net, &priv->list); + local_bh_disable(); + ret = nf_conncount_gc_list(net, &priv->list); + local_bh_enable(); + + return ret; } static struct nft_expr_type nft_connlimit_type; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 46ca8bcca1bd..faea72c2df32 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -440,12 +440,12 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, switch (ctx->family) { case NFPROTO_IPV4: - len = FIELD_SIZEOF(struct nf_conntrack_tuple, + len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip); break; case NFPROTO_IPV6: case NFPROTO_INET: - len = FIELD_SIZEOF(struct nf_conntrack_tuple, + len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip6); break; default: @@ -457,20 +457,20 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; - len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip); + len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip); break; case NFT_CT_SRC_IP6: case NFT_CT_DST_IP6: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; - len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip6); + len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip6); break; case NFT_CT_PROTO_SRC: case NFT_CT_PROTO_DST: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; - len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u.all); + len = sizeof_field(struct nf_conntrack_tuple, src.u.all); break; case NFT_CT_BYTES: case NFT_CT_PKTS: @@ -551,7 +551,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, case NFT_CT_MARK: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; - len = FIELD_SIZEOF(struct nf_conn, mark); + len = sizeof_field(struct nf_conn, mark); break; #endif #ifdef CONFIG_NF_CONNTRACK_LABELS diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index c6052fdd2c40..c2e78c160fd7 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -10,6 +10,7 @@ #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_offload.h> #include <net/netfilter/nf_dup_netdev.h> struct nft_dup_netdev { @@ -56,6 +57,16 @@ nla_put_failure: return -1; } +static int nft_dup_netdev_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_dup_netdev *priv = nft_expr_priv(expr); + int oif = ctx->regs[priv->sreg_dev].data.data[0]; + + return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_MIRRED, oif); +} + static struct nft_expr_type nft_dup_netdev_type; static const struct nft_expr_ops nft_dup_netdev_ops = { .type = &nft_dup_netdev_type, @@ -63,6 +74,7 @@ static const struct nft_expr_ops nft_dup_netdev_ops = { .eval = nft_dup_netdev_eval, .init = nft_dup_netdev_init, .dump = nft_dup_netdev_dump, + .offload = nft_dup_netdev_offload, }; static struct nft_expr_type nft_dup_netdev_type __read_mostly = { diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 33833a0cb989..683785225a3e 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -54,7 +54,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, timeout = priv->timeout ? : set->timeout; elem = nft_set_elem_init(set, &priv->tmpl, - ®s->data[priv->sreg_key], + ®s->data[priv->sreg_key], NULL, ®s->data[priv->sreg_data], timeout, 0, GFP_ATOMIC); if (elem == NULL) @@ -84,6 +84,11 @@ void nft_dynset_eval(const struct nft_expr *expr, const struct nft_expr *sexpr; u64 timeout; + if (priv->op == NFT_DYNSET_OP_DELETE) { + set->ops->delete(set, ®s->data[priv->sreg_key]); + return; + } + if (set->ops->update(set, ®s->data[priv->sreg_key], nft_dynset_new, expr, regs, &ext)) { sexpr = NULL; @@ -161,6 +166,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, priv->op = ntohl(nla_get_be32(tb[NFTA_DYNSET_OP])); switch (priv->op) { case NFT_DYNSET_OP_ADD: + case NFT_DYNSET_OP_DELETE: break; case NFT_DYNSET_OP_UPDATE: if (!(set->flags & NFT_SET_TIMEOUT)) diff --git a/net/netfilter/nft_fib_netdev.c b/net/netfilter/nft_fib_netdev.c index 2cf3f32fe6d2..a2e726ae7f07 100644 --- a/net/netfilter/nft_fib_netdev.c +++ b/net/netfilter/nft_fib_netdev.c @@ -14,6 +14,7 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> +#include <net/ipv6.h> #include <net/netfilter/nft_fib.h> @@ -34,6 +35,8 @@ static void nft_fib_netdev_eval(const struct nft_expr *expr, } break; case ETH_P_IPV6: + if (!ipv6_mod_enabled()) + break; switch (priv->result) { case NFT_FIB_RESULT_OIF: case NFT_FIB_RESULT_OIFNAME: diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 060a4ed46d5e..b70b48996801 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -6,12 +6,13 @@ #include <linux/netfilter.h> #include <linux/workqueue.h> #include <linux/spinlock.h> +#include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_tables.h> #include <net/ip.h> /* for ipv4 options. */ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack_core.h> -#include <linux/netfilter/nf_conntrack_common.h> +#include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_flow_table.h> struct nft_flow_offload { @@ -114,10 +115,13 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, if (nft_flow_route(pkt, ct, &route, dir) < 0) goto err_flow_route; - flow = flow_offload_alloc(ct, &route); + flow = flow_offload_alloc(ct); if (!flow) goto err_flow_alloc; + if (flow_offload_route_init(flow, &route) < 0) + goto err_flow_add; + if (tcph) { ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; @@ -149,6 +153,11 @@ static int nft_flow_offload_validate(const struct nft_ctx *ctx, return nft_chain_validate_hooks(ctx->chain, hook_mask); } +static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = { + [NFTA_FLOW_TABLE_NAME] = { .type = NLA_STRING, + .len = NFT_NAME_MAXLEN - 1 }, +}; + static int nft_flow_offload_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -171,12 +180,26 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, return nf_ct_netns_get(ctx->net, ctx->family); } -static void nft_flow_offload_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) +static void nft_flow_offload_deactivate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + enum nft_trans_phase phase) +{ + struct nft_flow_offload *priv = nft_expr_priv(expr); + + nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase); +} + +static void nft_flow_offload_activate(const struct nft_ctx *ctx, + const struct nft_expr *expr) { struct nft_flow_offload *priv = nft_expr_priv(expr); - priv->flowtable->use--; + priv->flowtable->use++; +} + +static void nft_flow_offload_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ nf_ct_netns_put(ctx->net, ctx->family); } @@ -199,6 +222,8 @@ static const struct nft_expr_ops nft_flow_offload_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)), .eval = nft_flow_offload_eval, .init = nft_flow_offload_init, + .activate = nft_flow_offload_activate, + .deactivate = nft_flow_offload_deactivate, .destroy = nft_flow_offload_destroy, .validate = nft_flow_offload_validate, .dump = nft_flow_offload_dump, @@ -207,6 +232,7 @@ static const struct nft_expr_ops nft_flow_offload_ops = { static struct nft_expr_type nft_flow_offload_type __read_mostly = { .name = "flow_offload", .ops = &nft_flow_offload_ops, + .policy = nft_flow_offload_policy, .maxattr = NFTA_FLOW_MAX, .owner = THIS_MODULE, }; diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 61b7f93ac681..aba11c2333f3 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -12,6 +12,7 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_offload.h> #include <net/netfilter/nf_dup_netdev.h> #include <net/neighbour.h> #include <net/ip.h> @@ -63,6 +64,16 @@ nla_put_failure: return -1; } +static int nft_fwd_netdev_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_fwd_netdev *priv = nft_expr_priv(expr); + int oif = ctx->regs[priv->sreg_dev].data.data[0]; + + return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_REDIRECT, oif); +} + struct nft_fwd_neigh { enum nft_registers sreg_dev:8; enum nft_registers sreg_addr:8; @@ -194,6 +205,7 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = { .eval = nft_fwd_netdev_eval, .init = nft_fwd_netdev_init, .dump = nft_fwd_netdev_dump, + .offload = nft_fwd_netdev_offload, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index ca2ae4b95a8d..c7f0ef73d939 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -125,17 +125,13 @@ static int nft_immediate_validate(const struct nft_ctx *ctx, return 0; } -static int nft_immediate_offload(struct nft_offload_ctx *ctx, - struct nft_flow_rule *flow, - const struct nft_expr *expr) +static int nft_immediate_offload_verdict(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_immediate_expr *priv) { - const struct nft_immediate_expr *priv = nft_expr_priv(expr); struct flow_action_entry *entry; const struct nft_data *data; - if (priv->dreg != NFT_REG_VERDICT) - return -EOPNOTSUPP; - entry = &flow->rule->action.entries[ctx->num_actions++]; data = &priv->data; @@ -153,6 +149,20 @@ static int nft_immediate_offload(struct nft_offload_ctx *ctx, return 0; } +static int nft_immediate_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + + if (priv->dreg == NFT_REG_VERDICT) + return nft_immediate_offload_verdict(ctx, flow, priv); + + memcpy(&ctx->regs[priv->dreg].data, &priv->data, sizeof(priv->data)); + + return 0; +} + static const struct nft_expr_ops nft_imm_ops = { .type = &nft_imm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index c0560bf3c31b..660bad688e2b 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -73,9 +73,6 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (IS_ERR(set)) return PTR_ERR(set); - if (set->flags & NFT_SET_EVAL) - return -EOPNOTSUPP; - priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]); err = nft_validate_register_load(priv->sreg, set->klen); if (err < 0) diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 39dc94f2491e..bc9fd98c5d6d 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -43,7 +43,7 @@ static int nft_masq_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { - u32 plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all); + u32 plen = sizeof_field(struct nf_nat_range, min_addr.all); struct nft_masq *priv = nft_expr_priv(expr); int err; diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index f69afb9ff3cb..951b6e87ed5d 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -17,6 +17,7 @@ #include <linux/smp.h> #include <linux/static_key.h> #include <net/dst.h> +#include <net/ip.h> #include <net/sock.h> #include <net/tcp_states.h> /* for TCP_TIME_WAIT */ #include <net/netfilter/nf_tables.h> @@ -26,16 +27,295 @@ #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */ +#define NFT_META_SECS_PER_MINUTE 60 +#define NFT_META_SECS_PER_HOUR 3600 +#define NFT_META_SECS_PER_DAY 86400 +#define NFT_META_DAYS_PER_WEEK 7 + static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); +static u8 nft_meta_weekday(void) +{ + time64_t secs = ktime_get_real_seconds(); + unsigned int dse; + u8 wday; + + secs -= NFT_META_SECS_PER_MINUTE * sys_tz.tz_minuteswest; + dse = div_u64(secs, NFT_META_SECS_PER_DAY); + wday = (4 + dse) % NFT_META_DAYS_PER_WEEK; + + return wday; +} + +static u32 nft_meta_hour(time64_t secs) +{ + struct tm tm; + + time64_to_tm(secs, 0, &tm); + + return tm.tm_hour * NFT_META_SECS_PER_HOUR + + tm.tm_min * NFT_META_SECS_PER_MINUTE + + tm.tm_sec; +} + +static noinline_for_stack void +nft_meta_get_eval_time(enum nft_meta_keys key, + u32 *dest) +{ + switch (key) { + case NFT_META_TIME_NS: + nft_reg_store64(dest, ktime_get_real_ns()); + break; + case NFT_META_TIME_DAY: + nft_reg_store8(dest, nft_meta_weekday()); + break; + case NFT_META_TIME_HOUR: + *dest = nft_meta_hour(ktime_get_real_seconds()); + break; + default: + break; + } +} + +static noinline bool +nft_meta_get_eval_pkttype_lo(const struct nft_pktinfo *pkt, + u32 *dest) +{ + const struct sk_buff *skb = pkt->skb; + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + if (ipv4_is_multicast(ip_hdr(skb)->daddr)) + nft_reg_store8(dest, PACKET_MULTICAST); + else + nft_reg_store8(dest, PACKET_BROADCAST); + break; + case NFPROTO_IPV6: + nft_reg_store8(dest, PACKET_MULTICAST); + break; + case NFPROTO_NETDEV: + switch (skb->protocol) { + case htons(ETH_P_IP): { + int noff = skb_network_offset(skb); + struct iphdr *iph, _iph; + + iph = skb_header_pointer(skb, noff, + sizeof(_iph), &_iph); + if (!iph) + return false; + + if (ipv4_is_multicast(iph->daddr)) + nft_reg_store8(dest, PACKET_MULTICAST); + else + nft_reg_store8(dest, PACKET_BROADCAST); + + break; + } + case htons(ETH_P_IPV6): + nft_reg_store8(dest, PACKET_MULTICAST); + break; + default: + WARN_ON_ONCE(1); + return false; + } + break; + default: + WARN_ON_ONCE(1); + return false; + } + + return true; +} + +static noinline bool +nft_meta_get_eval_skugid(enum nft_meta_keys key, + u32 *dest, + const struct nft_pktinfo *pkt) +{ + struct sock *sk = skb_to_full_sk(pkt->skb); + struct socket *sock; + + if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk))) + return false; + + read_lock_bh(&sk->sk_callback_lock); + sock = sk->sk_socket; + if (!sock || !sock->file) { + read_unlock_bh(&sk->sk_callback_lock); + return false; + } + + switch (key) { + case NFT_META_SKUID: + *dest = from_kuid_munged(&init_user_ns, + sock->file->f_cred->fsuid); + break; + case NFT_META_SKGID: + *dest = from_kgid_munged(&init_user_ns, + sock->file->f_cred->fsgid); + break; + default: + break; + } + + read_unlock_bh(&sk->sk_callback_lock); + return true; +} + +#ifdef CONFIG_CGROUP_NET_CLASSID +static noinline bool +nft_meta_get_eval_cgroup(u32 *dest, const struct nft_pktinfo *pkt) +{ + struct sock *sk = skb_to_full_sk(pkt->skb); + + if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk))) + return false; + + *dest = sock_cgroup_classid(&sk->sk_cgrp_data); + return true; +} +#endif + +static noinline bool nft_meta_get_eval_kind(enum nft_meta_keys key, + u32 *dest, + const struct nft_pktinfo *pkt) +{ + const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); + + switch (key) { + case NFT_META_IIFKIND: + if (!in || !in->rtnl_link_ops) + return false; + strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); + break; + case NFT_META_OIFKIND: + if (!out || !out->rtnl_link_ops) + return false; + strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); + break; + default: + return false; + } + + return true; +} + +static void nft_meta_store_ifindex(u32 *dest, const struct net_device *dev) +{ + *dest = dev ? dev->ifindex : 0; +} + +static void nft_meta_store_ifname(u32 *dest, const struct net_device *dev) +{ + strncpy((char *)dest, dev ? dev->name : "", IFNAMSIZ); +} + +static bool nft_meta_store_iftype(u32 *dest, const struct net_device *dev) +{ + if (!dev) + return false; + + nft_reg_store16(dest, dev->type); + return true; +} + +static bool nft_meta_store_ifgroup(u32 *dest, const struct net_device *dev) +{ + if (!dev) + return false; + + *dest = dev->group; + return true; +} + +static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest, + const struct nft_pktinfo *pkt) +{ + switch (key) { + case NFT_META_IIFNAME: + nft_meta_store_ifname(dest, nft_in(pkt)); + break; + case NFT_META_OIFNAME: + nft_meta_store_ifname(dest, nft_out(pkt)); + break; + case NFT_META_IIF: + nft_meta_store_ifindex(dest, nft_in(pkt)); + break; + case NFT_META_OIF: + nft_meta_store_ifindex(dest, nft_out(pkt)); + break; + case NFT_META_IIFTYPE: + if (!nft_meta_store_iftype(dest, nft_in(pkt))) + return false; + break; + case NFT_META_OIFTYPE: + if (!nft_meta_store_iftype(dest, nft_out(pkt))) + return false; + break; + case NFT_META_IIFGROUP: + if (!nft_meta_store_ifgroup(dest, nft_out(pkt))) + return false; + break; + case NFT_META_OIFGROUP: + if (!nft_meta_store_ifgroup(dest, nft_out(pkt))) + return false; + break; + default: + return false; + } + + return true; +} + +static noinline u32 nft_prandom_u32(void) +{ + struct rnd_state *state = this_cpu_ptr(&nft_prandom_state); + + return prandom_u32_state(state); +} + +#ifdef CONFIG_IP_ROUTE_CLASSID +static noinline bool +nft_meta_get_eval_rtclassid(const struct sk_buff *skb, u32 *dest) +{ + const struct dst_entry *dst = skb_dst(skb); + + if (!dst) + return false; + + *dest = dst->tclassid; + return true; +} +#endif + +static noinline u32 nft_meta_get_eval_sdif(const struct nft_pktinfo *pkt) +{ + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + return inet_sdif(pkt->skb); + case NFPROTO_IPV6: + return inet6_sdif(pkt->skb); + } + + return 0; +} + +static noinline void +nft_meta_get_eval_sdifname(u32 *dest, const struct nft_pktinfo *pkt) +{ + u32 sdif = nft_meta_get_eval_sdif(pkt); + const struct net_device *dev; + + dev = sdif ? dev_get_by_index_rcu(nft_net(pkt), sdif) : NULL; + nft_meta_store_ifname(dest, dev); +} + void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; - const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); - struct sock *sk; u32 *dest = ®s->data[priv->dreg]; switch (priv->key) { @@ -60,69 +340,26 @@ void nft_meta_get_eval(const struct nft_expr *expr, *dest = skb->mark; break; case NFT_META_IIF: - *dest = in ? in->ifindex : 0; - break; case NFT_META_OIF: - *dest = out ? out->ifindex : 0; - break; case NFT_META_IIFNAME: - strncpy((char *)dest, in ? in->name : "", IFNAMSIZ); - break; case NFT_META_OIFNAME: - strncpy((char *)dest, out ? out->name : "", IFNAMSIZ); - break; case NFT_META_IIFTYPE: - if (in == NULL) - goto err; - nft_reg_store16(dest, in->type); - break; case NFT_META_OIFTYPE: - if (out == NULL) + case NFT_META_IIFGROUP: + case NFT_META_OIFGROUP: + if (!nft_meta_get_eval_ifname(priv->key, dest, pkt)) goto err; - nft_reg_store16(dest, out->type); break; case NFT_META_SKUID: - sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk) || - !net_eq(nft_net(pkt), sock_net(sk))) - goto err; - - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket == NULL || - sk->sk_socket->file == NULL) { - read_unlock_bh(&sk->sk_callback_lock); - goto err; - } - - *dest = from_kuid_munged(&init_user_ns, - sk->sk_socket->file->f_cred->fsuid); - read_unlock_bh(&sk->sk_callback_lock); - break; case NFT_META_SKGID: - sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk) || - !net_eq(nft_net(pkt), sock_net(sk))) - goto err; - - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket == NULL || - sk->sk_socket->file == NULL) { - read_unlock_bh(&sk->sk_callback_lock); + if (!nft_meta_get_eval_skugid(priv->key, dest, pkt)) goto err; - } - *dest = from_kgid_munged(&init_user_ns, - sk->sk_socket->file->f_cred->fsgid); - read_unlock_bh(&sk->sk_callback_lock); break; #ifdef CONFIG_IP_ROUTE_CLASSID - case NFT_META_RTCLASSID: { - const struct dst_entry *dst = skb_dst(skb); - - if (dst == NULL) + case NFT_META_RTCLASSID: + if (!nft_meta_get_eval_rtclassid(skb, dest)) goto err; - *dest = dst->tclassid; break; - } #endif #ifdef CONFIG_NETWORK_SECMARK case NFT_META_SECMARK: @@ -135,88 +372,41 @@ void nft_meta_get_eval(const struct nft_expr *expr, break; } - switch (nft_pf(pkt)) { - case NFPROTO_IPV4: - if (ipv4_is_multicast(ip_hdr(skb)->daddr)) - nft_reg_store8(dest, PACKET_MULTICAST); - else - nft_reg_store8(dest, PACKET_BROADCAST); - break; - case NFPROTO_IPV6: - nft_reg_store8(dest, PACKET_MULTICAST); - break; - case NFPROTO_NETDEV: - switch (skb->protocol) { - case htons(ETH_P_IP): { - int noff = skb_network_offset(skb); - struct iphdr *iph, _iph; - - iph = skb_header_pointer(skb, noff, - sizeof(_iph), &_iph); - if (!iph) - goto err; - - if (ipv4_is_multicast(iph->daddr)) - nft_reg_store8(dest, PACKET_MULTICAST); - else - nft_reg_store8(dest, PACKET_BROADCAST); - - break; - } - case htons(ETH_P_IPV6): - nft_reg_store8(dest, PACKET_MULTICAST); - break; - default: - WARN_ON_ONCE(1); - goto err; - } - break; - default: - WARN_ON_ONCE(1); + if (!nft_meta_get_eval_pkttype_lo(pkt, dest)) goto err; - } break; case NFT_META_CPU: *dest = raw_smp_processor_id(); break; - case NFT_META_IIFGROUP: - if (in == NULL) - goto err; - *dest = in->group; - break; - case NFT_META_OIFGROUP: - if (out == NULL) - goto err; - *dest = out->group; - break; #ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: - sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk) || - !net_eq(nft_net(pkt), sock_net(sk))) + if (!nft_meta_get_eval_cgroup(dest, pkt)) goto err; - *dest = sock_cgroup_classid(&sk->sk_cgrp_data); break; #endif - case NFT_META_PRANDOM: { - struct rnd_state *state = this_cpu_ptr(&nft_prandom_state); - *dest = prandom_u32_state(state); + case NFT_META_PRANDOM: + *dest = nft_prandom_u32(); break; - } #ifdef CONFIG_XFRM case NFT_META_SECPATH: nft_reg_store8(dest, secpath_exists(skb)); break; #endif case NFT_META_IIFKIND: - if (in == NULL || in->rtnl_link_ops == NULL) - goto err; - strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); - break; case NFT_META_OIFKIND: - if (out == NULL || out->rtnl_link_ops == NULL) + if (!nft_meta_get_eval_kind(priv->key, dest, pkt)) goto err; - strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); + break; + case NFT_META_TIME_NS: + case NFT_META_TIME_DAY: + case NFT_META_TIME_HOUR: + nft_meta_get_eval_time(priv->key, dest); + break; + case NFT_META_SDIF: + *dest = nft_meta_get_eval_sdif(pkt); + break; + case NFT_META_SDIFNAME: + nft_meta_get_eval_sdifname(dest, pkt); break; default: WARN_ON(1); @@ -298,6 +488,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_MARK: case NFT_META_IIF: case NFT_META_OIF: + case NFT_META_SDIF: case NFT_META_SKUID: case NFT_META_SKGID: #ifdef CONFIG_IP_ROUTE_CLASSID @@ -319,6 +510,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_OIFNAME: case NFT_META_IIFKIND: case NFT_META_OIFKIND: + case NFT_META_SDIFNAME: len = IFNAMSIZ; break; case NFT_META_PRANDOM: @@ -330,6 +522,15 @@ int nft_meta_get_init(const struct nft_ctx *ctx, len = sizeof(u8); break; #endif + case NFT_META_TIME_NS: + len = sizeof(u64); + break; + case NFT_META_TIME_DAY: + len = sizeof(u8); + break; + case NFT_META_TIME_HOUR: + len = sizeof(u32); + break; default: return -EOPNOTSUPP; } @@ -340,16 +541,28 @@ int nft_meta_get_init(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_meta_get_init); -static int nft_meta_get_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) +static int nft_meta_get_validate_sdif(const struct nft_ctx *ctx) { -#ifdef CONFIG_XFRM - const struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; - if (priv->key != NFT_META_SECPATH) - return 0; + switch (ctx->family) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + case NFPROTO_INET: + hooks = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD); + break; + default: + return -EOPNOTSUPP; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); +} + +static int nft_meta_get_validate_xfrm(const struct nft_ctx *ctx) +{ +#ifdef CONFIG_XFRM + unsigned int hooks; switch (ctx->family) { case NFPROTO_NETDEV: @@ -372,6 +585,25 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx, #endif } +static int nft_meta_get_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + + switch (priv->key) { + case NFT_META_SECPATH: + return nft_meta_get_validate_xfrm(ctx); + case NFT_META_SDIF: + case NFT_META_SDIFNAME: + return nft_meta_get_validate_sdif(ctx); + default: + break; + } + + return 0; +} + int nft_meta_set_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) @@ -501,6 +733,14 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx, sizeof(__u8), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); break; + case NFT_META_IIF: + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta, + ingress_ifindex, sizeof(__u32), reg); + break; + case NFT_META_IIFTYPE: + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta, + ingress_iftype, sizeof(__u16), reg); + break; default: return -EOPNOTSUPP; } diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index c3c93e95b46e..8b44a4de5329 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -141,10 +141,10 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, switch (family) { case NFPROTO_IPV4: - alen = FIELD_SIZEOF(struct nf_nat_range, min_addr.ip); + alen = sizeof_field(struct nf_nat_range, min_addr.ip); break; case NFPROTO_IPV6: - alen = FIELD_SIZEOF(struct nf_nat_range, min_addr.ip6); + alen = sizeof_field(struct nf_nat_range, min_addr.ip6); break; default: return -EAFNOSUPPORT; @@ -171,7 +171,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } } - plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all); + plen = sizeof_field(struct nf_nat_range, min_addr.all); if (tb[NFTA_NAT_REG_PROTO_MIN]) { priv->sreg_proto_min = nft_parse_register(tb[NFTA_NAT_REG_PROTO_MIN]); diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index f54d6ae15bb1..b42247aa48a9 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -61,6 +61,9 @@ static int nft_osf_init(const struct nft_ctx *ctx, int err; u8 ttl; + if (!tb[NFTA_OSF_DREG]) + return -EINVAL; + if (tb[NFTA_OSF_TTL]) { ttl = nla_get_u8(tb[NFTA_OSF_TTL]); if (ttl > 2) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 22a80eb60222..1993af3a2979 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -23,50 +23,58 @@ #include <linux/ip.h> #include <linux/ipv6.h> +static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off, + struct vlan_ethhdr *veth) +{ + if (skb_copy_bits(skb, mac_off, veth, ETH_HLEN)) + return false; + + veth->h_vlan_proto = skb->vlan_proto; + veth->h_vlan_TCI = htons(skb_vlan_tag_get(skb)); + veth->h_vlan_encapsulated_proto = skb->protocol; + + return true; +} + /* add vlan header into the user buffer for if tag was removed by offloads */ static bool nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) { int mac_off = skb_mac_header(skb) - skb->data; - u8 vlan_len, *vlanh, *dst_u8 = (u8 *) d; + u8 *vlanh, *dst_u8 = (u8 *) d; struct vlan_ethhdr veth; + u8 vlan_hlen = 0; + + if ((skb->protocol == htons(ETH_P_8021AD) || + skb->protocol == htons(ETH_P_8021Q)) && + offset >= VLAN_ETH_HLEN && offset < VLAN_ETH_HLEN + VLAN_HLEN) + vlan_hlen += VLAN_HLEN; vlanh = (u8 *) &veth; - if (offset < ETH_HLEN) { - u8 ethlen = min_t(u8, len, ETH_HLEN - offset); + if (offset < VLAN_ETH_HLEN + vlan_hlen) { + u8 ethlen = len; - if (skb_copy_bits(skb, mac_off, &veth, ETH_HLEN)) + if (vlan_hlen && + skb_copy_bits(skb, mac_off, &veth, VLAN_ETH_HLEN) < 0) + return false; + else if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth)) return false; - veth.h_vlan_proto = skb->vlan_proto; + if (offset + len > VLAN_ETH_HLEN + vlan_hlen) + ethlen -= offset + len - VLAN_ETH_HLEN + vlan_hlen; - memcpy(dst_u8, vlanh + offset, ethlen); + memcpy(dst_u8, vlanh + offset - vlan_hlen, ethlen); len -= ethlen; if (len == 0) return true; dst_u8 += ethlen; - offset = ETH_HLEN; - } else if (offset >= VLAN_ETH_HLEN) { - offset -= VLAN_HLEN; - goto skip; + offset = ETH_HLEN + vlan_hlen; + } else { + offset -= VLAN_HLEN + vlan_hlen; } - veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); - veth.h_vlan_encapsulated_proto = skb->protocol; - - vlanh += offset; - - vlan_len = min_t(u8, len, VLAN_ETH_HLEN - offset); - memcpy(dst_u8, vlanh, vlan_len); - - len -= vlan_len; - if (!len) - return true; - - dst_u8 += vlan_len; - skip: return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0; } @@ -161,13 +169,59 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct ethhdr, h_source): + if (priv->len != ETH_ALEN) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, src, ETH_ALEN, reg); break; case offsetof(struct ethhdr, h_dest): + if (priv->len != ETH_ALEN) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, dst, ETH_ALEN, reg); break; + case offsetof(struct ethhdr, h_proto): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, + n_proto, sizeof(__be16), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); + break; + case offsetof(struct vlan_ethhdr, h_vlan_TCI): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan, + vlan_tci, sizeof(__be16), reg); + break; + case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan, + vlan_tpid, sizeof(__be16), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); + break; + case offsetof(struct vlan_ethhdr, h_vlan_TCI) + sizeof(struct vlan_hdr): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, + vlan_tci, sizeof(__be16), reg); + break; + case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto) + + sizeof(struct vlan_hdr): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, + vlan_tpid, sizeof(__be16), reg); + break; + default: + return -EOPNOTSUPP; } return 0; @@ -181,14 +235,23 @@ static int nft_payload_offload_ip(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct iphdr, saddr): + if (priv->len != sizeof(struct in_addr)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, src, sizeof(struct in_addr), reg); break; case offsetof(struct iphdr, daddr): + if (priv->len != sizeof(struct in_addr)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, dst, sizeof(struct in_addr), reg); break; case offsetof(struct iphdr, protocol): + if (priv->len != sizeof(__u8)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, sizeof(__u8), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); @@ -208,14 +271,23 @@ static int nft_payload_offload_ip6(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct ipv6hdr, saddr): + if (priv->len != sizeof(struct in6_addr)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, src, sizeof(struct in6_addr), reg); break; case offsetof(struct ipv6hdr, daddr): + if (priv->len != sizeof(struct in6_addr)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, dst, sizeof(struct in6_addr), reg); break; case offsetof(struct ipv6hdr, nexthdr): + if (priv->len != sizeof(__u8)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, sizeof(__u8), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); @@ -255,10 +327,16 @@ static int nft_payload_offload_tcp(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct tcphdr, source): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, sizeof(__be16), reg); break; case offsetof(struct tcphdr, dest): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, sizeof(__be16), reg); break; @@ -277,10 +355,16 @@ static int nft_payload_offload_udp(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct udphdr, source): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, sizeof(__be16), reg); break; case offsetof(struct udphdr, dest): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, sizeof(__be16), reg); break; diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index c8745d454bf8..4413690591f2 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -13,7 +13,7 @@ #include <net/netfilter/nf_tables.h> struct nft_quota { - u64 quota; + atomic64_t quota; unsigned long flags; atomic64_t consumed; }; @@ -21,7 +21,8 @@ struct nft_quota { static inline bool nft_overquota(struct nft_quota *priv, const struct sk_buff *skb) { - return atomic64_add_return(skb->len, &priv->consumed) >= priv->quota; + return atomic64_add_return(skb->len, &priv->consumed) >= + atomic64_read(&priv->quota); } static inline bool nft_quota_invert(struct nft_quota *priv) @@ -89,7 +90,7 @@ static int nft_quota_do_init(const struct nlattr * const tb[], return -EOPNOTSUPP; } - priv->quota = quota; + atomic64_set(&priv->quota, quota); priv->flags = flags; atomic64_set(&priv->consumed, consumed); @@ -105,10 +106,22 @@ static int nft_quota_obj_init(const struct nft_ctx *ctx, return nft_quota_do_init(tb, priv); } +static void nft_quota_obj_update(struct nft_object *obj, + struct nft_object *newobj) +{ + struct nft_quota *newpriv = nft_obj_data(newobj); + struct nft_quota *priv = nft_obj_data(obj); + u64 newquota; + + newquota = atomic64_read(&newpriv->quota); + atomic64_set(&priv->quota, newquota); + priv->flags = newpriv->flags; +} + static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, bool reset) { - u64 consumed, consumed_cap; + u64 consumed, consumed_cap, quota; u32 flags = priv->flags; /* Since we inconditionally increment consumed quota for each packet @@ -116,14 +129,15 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, * userspace. */ consumed = atomic64_read(&priv->consumed); - if (consumed >= priv->quota) { - consumed_cap = priv->quota; + quota = atomic64_read(&priv->quota); + if (consumed >= quota) { + consumed_cap = quota; flags |= NFT_QUOTA_F_DEPLETED; } else { consumed_cap = consumed; } - if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota), + if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(quota), NFTA_QUOTA_PAD) || nla_put_be64(skb, NFTA_QUOTA_CONSUMED, cpu_to_be64(consumed_cap), NFTA_QUOTA_PAD) || @@ -155,6 +169,7 @@ static const struct nft_object_ops nft_quota_obj_ops = { .init = nft_quota_obj_init, .eval = nft_quota_obj_eval, .dump = nft_quota_obj_dump, + .update = nft_quota_obj_update, }; static struct nft_object_type nft_quota_obj_type __read_mostly = { diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 4701fa8a45e7..89efcc5a533d 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -66,11 +66,21 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr if (err < 0) return err; + if (desc_from.type != NFT_DATA_VALUE) { + err = -EINVAL; + goto err1; + } + err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to), &desc_to, tb[NFTA_RANGE_TO_DATA]); if (err < 0) goto err1; + if (desc_to.type != NFT_DATA_VALUE) { + err = -EINVAL; + goto err2; + } + if (desc_from.len != desc_to.len) { err = -EINVAL; goto err2; diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index 43eeb1f609f1..5b779171565c 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -48,7 +48,7 @@ static int nft_redir_init(const struct nft_ctx *ctx, unsigned int plen; int err; - plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all); + plen = sizeof_field(struct nf_nat_range, min_addr.all); if (tb[NFTA_REDIR_REG_PROTO_MIN]) { priv->sreg_proto_min = nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MIN]); diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index b5aeccdddb22..87e8d9ba0c9b 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -10,7 +10,7 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> struct nft_bitmap_elem { struct list_head head; @@ -259,8 +259,8 @@ static u64 nft_bitmap_privsize(const struct nlattr * const nla[], } static int nft_bitmap_init(const struct nft_set *set, - const struct nft_set_desc *desc, - const struct nlattr * const nla[]) + const struct nft_set_desc *desc, + const struct nlattr * const nla[]) { struct nft_bitmap *priv = nft_set_priv(set); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 6e8d20c03e3d..d350a7cd3af0 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -16,7 +16,7 @@ #include <linux/rhashtable.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> /* We target a hash table size of 4, element hint is 75% of final size */ #define NFT_RHASH_ELEMENT_HINT 3 @@ -234,6 +234,24 @@ static void nft_rhash_remove(const struct net *net, rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); } +static bool nft_rhash_delete(const struct nft_set *set, + const u32 *key) +{ + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_cmp_arg arg = { + .genmask = NFT_GENMASK_ANY, + .set = set, + .key = key, + }; + struct nft_rhash_elem *he; + + he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); + if (he == NULL) + return false; + + return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0; +} + static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { @@ -627,7 +645,7 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, } static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features, - struct nft_set_estimate *est) + struct nft_set_estimate *est) { if (!desc->size) return false; @@ -662,6 +680,7 @@ struct nft_set_type nft_set_rhash_type __read_mostly = { .remove = nft_rhash_remove, .lookup = nft_rhash_lookup, .update = nft_rhash_update, + .delete = nft_rhash_delete, .walk = nft_rhash_walk, .get = nft_rhash_get, }, diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c new file mode 100644 index 000000000000..f0cb1e13af50 --- /dev/null +++ b/net/netfilter/nft_set_pipapo.c @@ -0,0 +1,2102 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* PIPAPO: PIle PAcket POlicies: set for arbitrary concatenations of ranges + * + * Copyright (c) 2019-2020 Red Hat GmbH + * + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +/** + * DOC: Theory of Operation + * + * + * Problem + * ------- + * + * Match packet bytes against entries composed of ranged or non-ranged packet + * field specifiers, mapping them to arbitrary references. For example: + * + * :: + * + * --- fields ---> + * | [net],[port],[net]... => [reference] + * entries [net],[port],[net]... => [reference] + * | [net],[port],[net]... => [reference] + * V ... + * + * where [net] fields can be IP ranges or netmasks, and [port] fields are port + * ranges. Arbitrary packet fields can be matched. + * + * + * Algorithm Overview + * ------------------ + * + * This algorithm is loosely inspired by [Ligatti 2010], and fundamentally + * relies on the consideration that every contiguous range in a space of b bits + * can be converted into b * 2 netmasks, from Theorem 3 in [Rottenstreich 2010], + * as also illustrated in Section 9 of [Kogan 2014]. + * + * Classification against a number of entries, that require matching given bits + * of a packet field, is performed by grouping those bits in sets of arbitrary + * size, and classifying packet bits one group at a time. + * + * Example: + * to match the source port (16 bits) of a packet, we can divide those 16 bits + * in 4 groups of 4 bits each. Given the entry: + * 0000 0001 0101 1001 + * and a packet with source port: + * 0000 0001 1010 1001 + * first and second groups match, but the third doesn't. We conclude that the + * packet doesn't match the given entry. + * + * Translate the set to a sequence of lookup tables, one per field. Each table + * has two dimensions: bit groups to be matched for a single packet field, and + * all the possible values of said groups (buckets). Input entries are + * represented as one or more rules, depending on the number of composing + * netmasks for the given field specifier, and a group match is indicated as a + * set bit, with number corresponding to the rule index, in all the buckets + * whose value matches the entry for a given group. + * + * Rules are mapped between fields through an array of x, n pairs, with each + * item mapping a matched rule to one or more rules. The position of the pair in + * the array indicates the matched rule to be mapped to the next field, x + * indicates the first rule index in the next field, and n the amount of + * next-field rules the current rule maps to. + * + * The mapping array for the last field maps to the desired references. + * + * To match, we perform table lookups using the values of grouped packet bits, + * and use a sequence of bitwise operations to progressively evaluate rule + * matching. + * + * A stand-alone, reference implementation, also including notes about possible + * future optimisations, is available at: + * https://pipapo.lameexcu.se/ + * + * Insertion + * --------- + * + * - For each packet field: + * + * - divide the b packet bits we want to classify into groups of size t, + * obtaining ceil(b / t) groups + * + * Example: match on destination IP address, with t = 4: 32 bits, 8 groups + * of 4 bits each + * + * - allocate a lookup table with one column ("bucket") for each possible + * value of a group, and with one row for each group + * + * Example: 8 groups, 2^4 buckets: + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 + * 1 + * 2 + * 3 + * 4 + * 5 + * 6 + * 7 + * + * - map the bits we want to classify for the current field, for a given + * entry, to a single rule for non-ranged and netmask set items, and to one + * or multiple rules for ranges. Ranges are expanded to composing netmasks + * by pipapo_expand(). + * + * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048 + * - rule #0: 10.0.0.5 + * - rule #1: 192.168.1.0/24 + * - rule #2: 192.168.2.0/31 + * + * - insert references to the rules in the lookup table, selecting buckets + * according to bit values of a rule in the given group. This is done by + * pipapo_insert(). + * + * Example: given: + * - rule #0: 10.0.0.5 mapping to buckets + * < 0 10 0 0 0 0 0 5 > + * - rule #1: 192.168.1.0/24 mapping to buckets + * < 12 0 10 8 0 1 < 0..15 > < 0..15 > > + * - rule #2: 192.168.2.0/31 mapping to buckets + * < 12 0 10 8 0 2 0 < 0..1 > > + * + * these bits are set in the lookup table: + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * 1 1,2 0 + * 2 0 1,2 + * 3 0 1,2 + * 4 0,1,2 + * 5 0 1 2 + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * + * - if this is not the last field in the set, fill a mapping array that maps + * rules from the lookup table to rules belonging to the same entry in + * the next lookup table, done by pipapo_map(). + * + * Note that as rules map to contiguous ranges of rules, given how netmask + * expansion and insertion is performed, &union nft_pipapo_map_bucket stores + * this information as pairs of first rule index, rule count. + * + * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048, + * given lookup table #0 for field 0 (see example above): + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * 1 1,2 0 + * 2 0 1,2 + * 3 0 1,2 + * 4 0,1,2 + * 5 0 1 2 + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * + * and lookup table #1 for field 1 with: + * - rule #0: 1024 mapping to buckets + * < 0 0 4 0 > + * - rule #1: 2048 mapping to buckets + * < 0 0 5 0 > + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0,1 + * 1 0,1 + * 2 0 1 + * 3 0,1 + * + * we need to map rules for 10.0.0.5 in lookup table #0 (rule #0) to 1024 + * in lookup table #1 (rule #0) and rules for 192.168.1.0-192.168.2.1 + * (rules #1, #2) to 2048 in lookup table #2 (rule #1): + * + * :: + * + * rule indices in current field: 0 1 2 + * map to rules in next field: 0 1 1 + * + * - if this is the last field in the set, fill a mapping array that maps + * rules from the last lookup table to element pointers, also done by + * pipapo_map(). + * + * Note that, in this implementation, we have two elements (start, end) for + * each entry. The pointer to the end element is stored in this array, and + * the pointer to the start element is linked from it. + * + * Example: entry 10.0.0.5:1024 has a corresponding &struct nft_pipapo_elem + * pointer, 0x66, and element for 192.168.1.0-192.168.2.1:2048 is at 0x42. + * From the rules of lookup table #1 as mapped above: + * + * :: + * + * rule indices in last field: 0 1 + * map to elements: 0x42 0x66 + * + * + * Matching + * -------- + * + * We use a result bitmap, with the size of a single lookup table bucket, to + * represent the matching state that applies at every algorithm step. This is + * done by pipapo_lookup(). + * + * - For each packet field: + * + * - start with an all-ones result bitmap (res_map in pipapo_lookup()) + * + * - perform a lookup into the table corresponding to the current field, + * for each group, and at every group, AND the current result bitmap with + * the value from the lookup table bucket + * + * :: + * + * Example: 192.168.1.5 < 12 0 10 8 0 1 0 5 >, with lookup table from + * insertion examples. + * Lookup table buckets are at least 3 bits wide, we'll assume 8 bits for + * convenience in this example. Initial result bitmap is 0xff, the steps + * below show the value of the result bitmap after each group is processed: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * result bitmap is now: 0xff & 0x6 [bucket 12] = 0x6 + * + * 1 1,2 0 + * result bitmap is now: 0x6 & 0x6 [bucket 0] = 0x6 + * + * 2 0 1,2 + * result bitmap is now: 0x6 & 0x6 [bucket 10] = 0x6 + * + * 3 0 1,2 + * result bitmap is now: 0x6 & 0x6 [bucket 8] = 0x6 + * + * 4 0,1,2 + * result bitmap is now: 0x6 & 0x7 [bucket 0] = 0x6 + * + * 5 0 1 2 + * result bitmap is now: 0x6 & 0x2 [bucket 1] = 0x2 + * + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * result bitmap is now: 0x2 & 0x7 [bucket 0] = 0x2 + * + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * final result bitmap for this field is: 0x2 & 0x3 [bucket 5] = 0x2 + * + * - at the next field, start with a new, all-zeroes result bitmap. For each + * bit set in the previous result bitmap, fill the new result bitmap + * (fill_map in pipapo_lookup()) with the rule indices from the + * corresponding buckets of the mapping field for this field, done by + * pipapo_refill() + * + * Example: with mapping table from insertion examples, with the current + * result bitmap from the previous example, 0x02: + * + * :: + * + * rule indices in current field: 0 1 2 + * map to rules in next field: 0 1 1 + * + * the new result bitmap will be 0x02: rule 1 was set, and rule 1 will be + * set. + * + * We can now extend this example to cover the second iteration of the step + * above (lookup and AND bitmap): assuming the port field is + * 2048 < 0 0 5 0 >, with starting result bitmap 0x2, and lookup table + * for "port" field from pre-computation example: + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0,1 + * 1 0,1 + * 2 0 1 + * 3 0,1 + * + * operations are: 0x2 & 0x3 [bucket 0] & 0x3 [bucket 0] & 0x2 [bucket 5] + * & 0x3 [bucket 0], resulting bitmap is 0x2. + * + * - if this is the last field in the set, look up the value from the mapping + * array corresponding to the final result bitmap + * + * Example: 0x2 resulting bitmap from 192.168.1.5:2048, mapping array for + * last field from insertion example: + * + * :: + * + * rule indices in last field: 0 1 + * map to elements: 0x42 0x66 + * + * the matching element is at 0x42. + * + * + * References + * ---------- + * + * [Ligatti 2010] + * A Packet-classification Algorithm for Arbitrary Bitmask Rules, with + * Automatic Time-space Tradeoffs + * Jay Ligatti, Josh Kuhn, and Chris Gage. + * Proceedings of the IEEE International Conference on Computer + * Communication Networks (ICCCN), August 2010. + * http://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf + * + * [Rottenstreich 2010] + * Worst-Case TCAM Rule Expansion + * Ori Rottenstreich and Isaac Keslassy. + * 2010 Proceedings IEEE INFOCOM, San Diego, CA, 2010. + * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.212.4592&rep=rep1&type=pdf + * + * [Kogan 2014] + * SAX-PAC (Scalable And eXpressive PAcket Classification) + * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane, + * and Patrick Eugster. + * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014. + * http://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/log2.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <uapi/linux/netfilter/nf_tables.h> +#include <net/ipv6.h> /* For the maximum length of a field */ +#include <linux/bitmap.h> +#include <linux/bitops.h> + +/* Count of concatenated fields depends on count of 32-bit nftables registers */ +#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT + +/* Largest supported field size */ +#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr)) +#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE) + +/* Number of bits to be grouped together in lookup table buckets, arbitrary */ +#define NFT_PIPAPO_GROUP_BITS 4 +#define NFT_PIPAPO_GROUPS_PER_BYTE (BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS) + +/* Fields are padded to 32 bits in input registers */ +#define NFT_PIPAPO_GROUPS_PADDED_SIZE(x) \ + (round_up((x) / NFT_PIPAPO_GROUPS_PER_BYTE, sizeof(u32))) +#define NFT_PIPAPO_GROUPS_PADDING(x) \ + (NFT_PIPAPO_GROUPS_PADDED_SIZE((x)) - (x) / NFT_PIPAPO_GROUPS_PER_BYTE) + +/* Number of buckets, given by 2 ^ n, with n grouped bits */ +#define NFT_PIPAPO_BUCKETS (1 << NFT_PIPAPO_GROUP_BITS) + +/* Each n-bit range maps to up to n * 2 rules */ +#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2)) + +/* Use the rest of mapping table buckets for rule indices, but it makes no sense + * to exceed 32 bits + */ +#if BITS_PER_LONG == 64 +#define NFT_PIPAPO_MAP_TOBITS 32 +#else +#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS) +#endif + +/* ...which gives us the highest allowed index for a rule */ +#define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \ + - (1UL << NFT_PIPAPO_MAP_NBITS)) + +#define nft_pipapo_for_each_field(field, index, match) \ + for ((field) = (match)->f, (index) = 0; \ + (index) < (match)->field_count; \ + (index)++, (field)++) + +/** + * union nft_pipapo_map_bucket - Bucket of mapping table + * @to: First rule number (in next field) this rule maps to + * @n: Number of rules (in next field) this rule maps to + * @e: If there's no next field, pointer to element this rule maps to + */ +union nft_pipapo_map_bucket { + struct { +#if BITS_PER_LONG == 64 + static_assert(NFT_PIPAPO_MAP_TOBITS <= 32); + u32 to; + + static_assert(NFT_PIPAPO_MAP_NBITS <= 32); + u32 n; +#else + unsigned long to:NFT_PIPAPO_MAP_TOBITS; + unsigned long n:NFT_PIPAPO_MAP_NBITS; +#endif + }; + struct nft_pipapo_elem *e; +}; + +/** + * struct nft_pipapo_field - Lookup, mapping tables and related data for a field + * @groups: Amount of 4-bit groups + * @rules: Number of inserted rules + * @bsize: Size of each bucket in lookup table, in longs + * @lt: Lookup table: 'groups' rows of NFT_PIPAPO_BUCKETS buckets + * @mt: Mapping table: one bucket per rule + */ +struct nft_pipapo_field { + int groups; + unsigned long rules; + size_t bsize; + unsigned long *lt; + union nft_pipapo_map_bucket *mt; +}; + +/** + * struct nft_pipapo_match - Data used for lookup and matching + * @field_count Amount of fields in set + * @scratch: Preallocated per-CPU maps for partial matching results + * @bsize_max: Maximum lookup table bucket size of all fields, in longs + * @rcu Matching data is swapped on commits + * @f: Fields, with lookup and mapping tables + */ +struct nft_pipapo_match { + int field_count; + unsigned long * __percpu *scratch; + size_t bsize_max; + struct rcu_head rcu; + struct nft_pipapo_field f[0]; +}; + +/* Current working bitmap index, toggled between field matches */ +static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index); + +/** + * struct nft_pipapo - Representation of a set + * @match: Currently in-use matching data + * @clone: Copy where pending insertions and deletions are kept + * @groups: Total amount of 4-bit groups for fields in this set + * @width: Total bytes to be matched for one packet, including padding + * @dirty: Working copy has pending insertions or deletions + * @last_gc: Timestamp of last garbage collection run, jiffies + */ +struct nft_pipapo { + struct nft_pipapo_match __rcu *match; + struct nft_pipapo_match *clone; + int groups; + int width; + bool dirty; + unsigned long last_gc; +}; + +struct nft_pipapo_elem; + +/** + * struct nft_pipapo_elem - API-facing representation of single set element + * @ext: nftables API extensions + */ +struct nft_pipapo_elem { + struct nft_set_ext ext; +}; + +/** + * pipapo_refill() - For each set bit, set bits from selected mapping table item + * @map: Bitmap to be scanned for set bits + * @len: Length of bitmap in longs + * @rules: Number of rules in field + * @dst: Destination bitmap + * @mt: Mapping table containing bit set specifiers + * @match_only: Find a single bit and return, don't fill + * + * Iteration over set bits with __builtin_ctzl(): Daniel Lemire, public domain. + * + * For each bit set in map, select the bucket from mapping table with index + * corresponding to the position of the bit set. Use start bit and amount of + * bits specified in bucket to fill region in dst. + * + * Return: -1 on no match, bit position on 'match_only', 0 otherwise. + */ +static int pipapo_refill(unsigned long *map, int len, int rules, + unsigned long *dst, union nft_pipapo_map_bucket *mt, + bool match_only) +{ + unsigned long bitset; + int k, ret = -1; + + for (k = 0; k < len; k++) { + bitset = map[k]; + while (bitset) { + unsigned long t = bitset & -bitset; + int r = __builtin_ctzl(bitset); + int i = k * BITS_PER_LONG + r; + + if (unlikely(i >= rules)) { + map[k] = 0; + return -1; + } + + if (unlikely(match_only)) { + bitmap_clear(map, i, 1); + return i; + } + + ret = 0; + + bitmap_set(dst, mt[i].to, mt[i].n); + + bitset ^= t; + } + map[k] = 0; + } + + return ret; +} + +/** + * nft_pipapo_lookup() - Lookup function + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * @ext: nftables API extension pointer, filled with matching reference + * + * For more details, see DOC: Theory of Operation. + * + * Return: true on match, false otherwise. + */ +static bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + struct nft_pipapo *priv = nft_set_priv(set); + unsigned long *res_map, *fill_map; + u8 genmask = nft_genmask_cur(net); + const u8 *rp = (const u8 *)key; + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + bool map_index; + int i; + + local_bh_disable(); + + map_index = raw_cpu_read(nft_pipapo_scratch_index); + + m = rcu_dereference(priv->match); + + if (unlikely(!m || !*raw_cpu_ptr(m->scratch))) + goto out; + + res_map = *raw_cpu_ptr(m->scratch) + (map_index ? m->bsize_max : 0); + fill_map = *raw_cpu_ptr(m->scratch) + (map_index ? 0 : m->bsize_max); + + memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); + + nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + unsigned long *lt = f->lt; + int b, group; + + /* For each 4-bit group: select lookup table bucket depending on + * packet bytes value, then AND bucket value + */ + for (group = 0; group < f->groups; group += 2) { + u8 v; + + v = *rp >> 4; + __bitmap_and(res_map, res_map, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + lt += f->bsize * NFT_PIPAPO_BUCKETS; + + v = *rp & 0x0f; + rp++; + __bitmap_and(res_map, res_map, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + lt += f->bsize * NFT_PIPAPO_BUCKETS; + } + + /* Now populate the bitmap for the next field, unless this is + * the last field, in which case return the matched 'ext' + * pointer if any. + * + * Now res_map contains the matching bitmap, and fill_map is the + * bitmap for the next field. + */ +next_match: + b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, + last); + if (b < 0) { + raw_cpu_write(nft_pipapo_scratch_index, map_index); + local_bh_enable(); + + return false; + } + + if (last) { + *ext = &f->mt[b].e->ext; + if (unlikely(nft_set_elem_expired(*ext) || + !nft_set_elem_active(*ext, genmask))) + goto next_match; + + /* Last field: we're just returning the key without + * filling the initial bitmap for the next field, so the + * current inactive bitmap is clean and can be reused as + * *next* bitmap (not initial) for the next packet. + */ + raw_cpu_write(nft_pipapo_scratch_index, map_index); + local_bh_enable(); + + return true; + } + + /* Swap bitmap indices: res_map is the initial bitmap for the + * next field, and fill_map is guaranteed to be all-zeroes at + * this point. + */ + map_index = !map_index; + swap(res_map, fill_map); + + rp += NFT_PIPAPO_GROUPS_PADDING(f->groups); + } + +out: + local_bh_enable(); + return false; +} + +/** + * pipapo_get() - Get matching element reference given key data + * @net: Network namespace + * @set: nftables API set representation + * @data: Key data to be matched against existing elements + * @genmask: If set, check that element is active in given genmask + * + * This is essentially the same as the lookup function, except that it matches + * key data against the uncommitted copy and doesn't use preallocated maps for + * bitmap results. + * + * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise. + */ +static struct nft_pipapo_elem *pipapo_get(const struct net *net, + const struct nft_set *set, + const u8 *data, u8 genmask) +{ + struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = priv->clone; + unsigned long *res_map, *fill_map = NULL; + struct nft_pipapo_field *f; + int i; + + res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); + if (!res_map) { + ret = ERR_PTR(-ENOMEM); + goto out; + } + + fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); + if (!fill_map) { + ret = ERR_PTR(-ENOMEM); + goto out; + } + + memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); + + nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + unsigned long *lt = f->lt; + int b, group; + + /* For each 4-bit group: select lookup table bucket depending on + * packet bytes value, then AND bucket value + */ + for (group = 0; group < f->groups; group++) { + u8 v; + + if (group % 2) { + v = *data & 0x0f; + data++; + } else { + v = *data >> 4; + } + __bitmap_and(res_map, res_map, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + + lt += f->bsize * NFT_PIPAPO_BUCKETS; + } + + /* Now populate the bitmap for the next field, unless this is + * the last field, in which case return the matched 'ext' + * pointer if any. + * + * Now res_map contains the matching bitmap, and fill_map is the + * bitmap for the next field. + */ +next_match: + b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, + last); + if (b < 0) + goto out; + + if (last) { + if (nft_set_elem_expired(&f->mt[b].e->ext) || + (genmask && + !nft_set_elem_active(&f->mt[b].e->ext, genmask))) + goto next_match; + + ret = f->mt[b].e; + goto out; + } + + data += NFT_PIPAPO_GROUPS_PADDING(f->groups); + + /* Swap bitmap indices: fill_map will be the initial bitmap for + * the next field (i.e. the new res_map), and res_map is + * guaranteed to be all-zeroes at this point, ready to be filled + * according to the next mapping table. + */ + swap(res_map, fill_map); + } + +out: + kfree(fill_map); + kfree(res_map); + return ret; +} + +/** + * nft_pipapo_get() - Get matching element reference given key data + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * @flags: Unused + */ +void *nft_pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + return pipapo_get(net, set, (const u8 *)elem->key.val.data, + nft_genmask_cur(net)); +} + +/** + * pipapo_resize() - Resize lookup or mapping table, or both + * @f: Field containing lookup and mapping tables + * @old_rules: Previous amount of rules in field + * @rules: New amount of rules + * + * Increase, decrease or maintain tables size depending on new amount of rules, + * and copy data over. In case the new size is smaller, throw away data for + * highest-numbered rules. + * + * Return: 0 on success, -ENOMEM on allocation failure. + */ +static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) +{ + long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p; + union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt; + size_t new_bucket_size, copy; + int group, bucket; + + new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG); + + if (new_bucket_size == f->bsize) + goto mt; + + if (new_bucket_size > f->bsize) + copy = f->bsize; + else + copy = new_bucket_size; + + new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS * new_bucket_size * + sizeof(*new_lt), GFP_KERNEL); + if (!new_lt) + return -ENOMEM; + + new_p = new_lt; + old_p = old_lt; + for (group = 0; group < f->groups; group++) { + for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS; bucket++) { + memcpy(new_p, old_p, copy * sizeof(*new_p)); + new_p += copy; + old_p += copy; + + if (new_bucket_size > f->bsize) + new_p += new_bucket_size - f->bsize; + else + old_p += f->bsize - new_bucket_size; + } + } + +mt: + new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL); + if (!new_mt) { + kvfree(new_lt); + return -ENOMEM; + } + + memcpy(new_mt, f->mt, min(old_rules, rules) * sizeof(*new_mt)); + if (rules > old_rules) { + memset(new_mt + old_rules, 0, + (rules - old_rules) * sizeof(*new_mt)); + } + + if (new_lt) { + f->bsize = new_bucket_size; + f->lt = new_lt; + kvfree(old_lt); + } + + f->mt = new_mt; + kvfree(old_mt); + + return 0; +} + +/** + * pipapo_bucket_set() - Set rule bit in bucket given group and group value + * @f: Field containing lookup table + * @rule: Rule index + * @group: Group index + * @v: Value of bit group + */ +static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group, + int v) +{ + unsigned long *pos; + + pos = f->lt + f->bsize * NFT_PIPAPO_BUCKETS * group; + pos += f->bsize * v; + + __set_bit(rule, pos); +} + +/** + * pipapo_insert() - Insert new rule in field given input key and mask length + * @f: Field containing lookup table + * @k: Input key for classification, without nftables padding + * @mask_bits: Length of mask; matches field length for non-ranged entry + * + * Insert a new rule reference in lookup buckets corresponding to k and + * mask_bits. + * + * Return: 1 on success (one rule inserted), negative error code on failure. + */ +static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, + int mask_bits) +{ + int rule = f->rules++, group, ret; + + ret = pipapo_resize(f, f->rules - 1, f->rules); + if (ret) + return ret; + + for (group = 0; group < f->groups; group++) { + int i, v; + u8 mask; + + if (group % 2) + v = k[group / 2] & 0x0f; + else + v = k[group / 2] >> 4; + + if (mask_bits >= (group + 1) * 4) { + /* Not masked */ + pipapo_bucket_set(f, rule, group, v); + } else if (mask_bits <= group * 4) { + /* Completely masked */ + for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) + pipapo_bucket_set(f, rule, group, i); + } else { + /* The mask limit falls on this group */ + mask = 0x0f >> (mask_bits - group * 4); + for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) { + if ((i & ~mask) == (v & ~mask)) + pipapo_bucket_set(f, rule, group, i); + } + } + } + + return 1; +} + +/** + * pipapo_step_diff() - Check if setting @step bit in netmask would change it + * @base: Mask we are expanding + * @step: Step bit for given expansion step + * @len: Total length of mask space (set and unset bits), bytes + * + * Convenience function for mask expansion. + * + * Return: true if step bit changes mask (i.e. isn't set), false otherwise. + */ +static bool pipapo_step_diff(u8 *base, int step, int len) +{ + /* Network order, byte-addressed */ +#ifdef __BIG_ENDIAN__ + return !(BIT(step % BITS_PER_BYTE) & base[step / BITS_PER_BYTE]); +#else + return !(BIT(step % BITS_PER_BYTE) & + base[len - 1 - step / BITS_PER_BYTE]); +#endif +} + +/** + * pipapo_step_after_end() - Check if mask exceeds range end with given step + * @base: Mask we are expanding + * @end: End of range + * @step: Step bit for given expansion step, highest bit to be set + * @len: Total length of mask space (set and unset bits), bytes + * + * Convenience function for mask expansion. + * + * Return: true if mask exceeds range setting step bits, false otherwise. + */ +static bool pipapo_step_after_end(const u8 *base, const u8 *end, int step, + int len) +{ + u8 tmp[NFT_PIPAPO_MAX_BYTES]; + int i; + + memcpy(tmp, base, len); + + /* Network order, byte-addressed */ + for (i = 0; i <= step; i++) +#ifdef __BIG_ENDIAN__ + tmp[i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); +#else + tmp[len - 1 - i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); +#endif + + return memcmp(tmp, end, len) > 0; +} + +/** + * pipapo_base_sum() - Sum step bit to given len-sized netmask base with carry + * @base: Netmask base + * @step: Step bit to sum + * @len: Netmask length, bytes + */ +static void pipapo_base_sum(u8 *base, int step, int len) +{ + bool carry = false; + int i; + + /* Network order, byte-addressed */ +#ifdef __BIG_ENDIAN__ + for (i = step / BITS_PER_BYTE; i < len; i++) { +#else + for (i = len - 1 - step / BITS_PER_BYTE; i >= 0; i--) { +#endif + if (carry) + base[i]++; + else + base[i] += 1 << (step % BITS_PER_BYTE); + + if (base[i]) + break; + + carry = true; + } +} + +/** + * pipapo_expand() - Expand to composing netmasks, insert into lookup table + * @f: Field containing lookup table + * @start: Start of range + * @end: End of range + * @len: Length of value in bits + * + * Expand range to composing netmasks and insert corresponding rule references + * in lookup buckets. + * + * Return: number of inserted rules on success, negative error code on failure. + */ +static int pipapo_expand(struct nft_pipapo_field *f, + const u8 *start, const u8 *end, int len) +{ + int step, masks = 0, bytes = DIV_ROUND_UP(len, BITS_PER_BYTE); + u8 base[NFT_PIPAPO_MAX_BYTES]; + + memcpy(base, start, bytes); + while (memcmp(base, end, bytes) <= 0) { + int err; + + step = 0; + while (pipapo_step_diff(base, step, bytes)) { + if (pipapo_step_after_end(base, end, step, bytes)) + break; + + step++; + if (step >= len) { + if (!masks) { + pipapo_insert(f, base, 0); + masks = 1; + } + goto out; + } + } + + err = pipapo_insert(f, base, len - step); + + if (err < 0) + return err; + + masks++; + pipapo_base_sum(base, step, bytes); + } +out: + return masks; +} + +/** + * pipapo_map() - Insert rules in mapping tables, mapping them between fields + * @m: Matching data, including mapping table + * @map: Table of rule maps: array of first rule and amount of rules + * in next field a given rule maps to, for each field + * @ext: For last field, nft_set_ext pointer matching rules map to + */ +static void pipapo_map(struct nft_pipapo_match *m, + union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS], + struct nft_pipapo_elem *e) +{ + struct nft_pipapo_field *f; + int i, j; + + for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) { + for (j = 0; j < map[i].n; j++) { + f->mt[map[i].to + j].to = map[i + 1].to; + f->mt[map[i].to + j].n = map[i + 1].n; + } + } + + /* Last field: map to ext instead of mapping to next field */ + for (j = 0; j < map[i].n; j++) + f->mt[map[i].to + j].e = e; +} + +/** + * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results + * @clone: Copy of matching data with pending insertions and deletions + * @bsize_max Maximum bucket size, scratch maps cover two buckets + * + * Return: 0 on success, -ENOMEM on failure. + */ +static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, + unsigned long bsize_max) +{ + int i; + + for_each_possible_cpu(i) { + unsigned long *scratch; + + scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2, + GFP_KERNEL, cpu_to_node(i)); + if (!scratch) { + /* On failure, there's no need to undo previous + * allocations: this means that some scratch maps have + * a bigger allocated size now (this is only called on + * insertion), but the extra space won't be used by any + * CPU as new elements are not inserted and m->bsize_max + * is not updated. + */ + return -ENOMEM; + } + + kfree(*per_cpu_ptr(clone->scratch, i)); + + *per_cpu_ptr(clone->scratch, i) = scratch; + } + + return 0; +} + +/** + * nft_pipapo_insert() - Validate and insert ranged elements + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * @ext2: Filled with pointer to &struct nft_set_ext in inserted element + * + * Return: 0 on success, error pointer on failure. + */ +static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, + struct nft_set_ext **ext2) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; + const u8 *start = (const u8 *)elem->key.val.data, *end; + struct nft_pipapo_elem *e = elem->priv, *dup; + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = priv->clone; + u8 genmask = nft_genmask_next(net); + struct nft_pipapo_field *f; + int i, bsize_max, err = 0; + + dup = pipapo_get(net, set, start, genmask); + if (PTR_ERR(dup) == -ENOENT) { + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) { + end = (const u8 *)nft_set_ext_key_end(ext)->data; + dup = pipapo_get(net, set, end, nft_genmask_next(net)); + } else { + end = start; + } + } + + if (PTR_ERR(dup) != -ENOENT) { + if (IS_ERR(dup)) + return PTR_ERR(dup); + *ext2 = &dup->ext; + return -EEXIST; + } + + /* Validate */ + nft_pipapo_for_each_field(f, i, m) { + const u8 *start_p = start, *end_p = end; + + if (f->rules >= (unsigned long)NFT_PIPAPO_RULE0_MAX) + return -ENOSPC; + + if (memcmp(start_p, end_p, + f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) > 0) + return -EINVAL; + + start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + } + + /* Insert */ + priv->dirty = true; + + bsize_max = m->bsize_max; + + nft_pipapo_for_each_field(f, i, m) { + int ret; + + rulemap[i].to = f->rules; + + ret = memcmp(start, end, + f->groups / NFT_PIPAPO_GROUPS_PER_BYTE); + if (!ret) { + ret = pipapo_insert(f, start, + f->groups * NFT_PIPAPO_GROUP_BITS); + } else { + ret = pipapo_expand(f, start, end, + f->groups * NFT_PIPAPO_GROUP_BITS); + } + + if (f->bsize > bsize_max) + bsize_max = f->bsize; + + rulemap[i].n = ret; + + start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + } + + if (!*this_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) { + err = pipapo_realloc_scratch(m, bsize_max); + if (err) + return err; + + this_cpu_write(nft_pipapo_scratch_index, false); + + m->bsize_max = bsize_max; + } + + *ext2 = &e->ext; + + pipapo_map(m, rulemap, e); + + return 0; +} + +/** + * pipapo_clone() - Clone matching data to create new working copy + * @old: Existing matching data + * + * Return: copy of matching data passed as 'old', error pointer on failure + */ +static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) +{ + struct nft_pipapo_field *dst, *src; + struct nft_pipapo_match *new; + int i; + + new = kmalloc(sizeof(*new) + sizeof(*dst) * old->field_count, + GFP_KERNEL); + if (!new) + return ERR_PTR(-ENOMEM); + + new->field_count = old->field_count; + new->bsize_max = old->bsize_max; + + new->scratch = alloc_percpu(*new->scratch); + if (!new->scratch) + goto out_scratch; + + rcu_head_init(&new->rcu); + + src = old->f; + dst = new->f; + + for (i = 0; i < old->field_count; i++) { + memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)); + + dst->lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS * + src->bsize * sizeof(*dst->lt), + GFP_KERNEL); + if (!dst->lt) + goto out_lt; + + memcpy(dst->lt, src->lt, + src->bsize * sizeof(*dst->lt) * + src->groups * NFT_PIPAPO_BUCKETS); + + dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL); + if (!dst->mt) + goto out_mt; + + memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt)); + src++; + dst++; + } + + return new; + +out_mt: + kvfree(dst->lt); +out_lt: + for (dst--; i > 0; i--) { + kvfree(dst->mt); + kvfree(dst->lt); + dst--; + } + free_percpu(new->scratch); +out_scratch: + kfree(new); + + return ERR_PTR(-ENOMEM); +} + +/** + * pipapo_rules_same_key() - Get number of rules originated from the same entry + * @f: Field containing mapping table + * @first: Index of first rule in set of rules mapping to same entry + * + * Using the fact that all rules in a field that originated from the same entry + * will map to the same set of rules in the next field, or to the same element + * reference, return the cardinality of the set of rules that originated from + * the same entry as the rule with index @first, @first rule included. + * + * In pictures: + * rules + * field #0 0 1 2 3 4 + * map to: 0 1 2-4 2-4 5-9 + * . . ....... . ... + * | | | | \ \ + * | | | | \ \ + * | | | | \ \ + * ' ' ' ' ' \ + * in field #1 0 1 2 3 4 5 ... + * + * if this is called for rule 2 on field #0, it will return 3, as also rules 2 + * and 3 in field 0 map to the same set of rules (2, 3, 4) in the next field. + * + * For the last field in a set, we can rely on associated entries to map to the + * same element references. + * + * Return: Number of rules that originated from the same entry as @first. + */ +static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first) +{ + struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */ + int r; + + for (r = first; r < f->rules; r++) { + if (r != first && e != f->mt[r].e) + return r - first; + + e = f->mt[r].e; + } + + if (r != first) + return r - first; + + return 0; +} + +/** + * pipapo_unmap() - Remove rules from mapping tables, renumber remaining ones + * @mt: Mapping array + * @rules: Original amount of rules in mapping table + * @start: First rule index to be removed + * @n: Amount of rules to be removed + * @to_offset: First rule index, in next field, this group of rules maps to + * @is_last: If this is the last field, delete reference from mapping array + * + * This is used to unmap rules from the mapping table for a single field, + * maintaining consistency and compactness for the existing ones. + * + * In pictures: let's assume that we want to delete rules 2 and 3 from the + * following mapping array: + * + * rules + * 0 1 2 3 4 + * map to: 4-10 4-10 11-15 11-15 16-18 + * + * the result will be: + * + * rules + * 0 1 2 + * map to: 4-10 4-10 11-13 + * + * for fields before the last one. In case this is the mapping table for the + * last field in a set, and rules map to pointers to &struct nft_pipapo_elem: + * + * rules + * 0 1 2 3 4 + * element pointers: 0x42 0x42 0x33 0x33 0x44 + * + * the result will be: + * + * rules + * 0 1 2 + * element pointers: 0x42 0x42 0x44 + */ +static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules, + int start, int n, int to_offset, bool is_last) +{ + int i; + + memmove(mt + start, mt + start + n, (rules - start - n) * sizeof(*mt)); + memset(mt + rules - n, 0, n * sizeof(*mt)); + + if (is_last) + return; + + for (i = start; i < rules - n; i++) + mt[i].to -= to_offset; +} + +/** + * pipapo_drop() - Delete entry from lookup and mapping tables, given rule map + * @m: Matching data + * @rulemap Table of rule maps, arrays of first rule and amount of rules + * in next field a given entry maps to, for each field + * + * For each rule in lookup table buckets mapping to this set of rules, drop + * all bits set in lookup table mapping. In pictures, assuming we want to drop + * rules 0 and 1 from this lookup table: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * 1 1,2 0 + * 2 0 1,2 + * 3 0 1,2 + * 4 0,1,2 + * 5 0 1 2 + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * + * rule 2 becomes rule 0, and the result will be: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 + * 1 0 + * 2 0 + * 3 0 + * 4 0 + * 5 0 + * 6 0 + * 7 0 0 + * + * once this is done, call unmap() to drop all the corresponding rule references + * from mapping tables. + */ +static void pipapo_drop(struct nft_pipapo_match *m, + union nft_pipapo_map_bucket rulemap[]) +{ + struct nft_pipapo_field *f; + int i; + + nft_pipapo_for_each_field(f, i, m) { + int g; + + for (g = 0; g < f->groups; g++) { + unsigned long *pos; + int b; + + pos = f->lt + g * NFT_PIPAPO_BUCKETS * f->bsize; + + for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) { + bitmap_cut(pos, pos, rulemap[i].to, + rulemap[i].n, + f->bsize * BITS_PER_LONG); + + pos += f->bsize; + } + } + + pipapo_unmap(f->mt, f->rules, rulemap[i].to, rulemap[i].n, + rulemap[i + 1].n, i == m->field_count - 1); + if (pipapo_resize(f, f->rules, f->rules - rulemap[i].n)) { + /* We can ignore this, a failure to shrink tables down + * doesn't make tables invalid. + */ + ; + } + f->rules -= rulemap[i].n; + } +} + +/** + * pipapo_gc() - Drop expired entries from set, destroy start and end elements + * @set: nftables API set representation + * @m: Matching data + */ +static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) +{ + struct nft_pipapo *priv = nft_set_priv(set); + int rules_f0, first_rule = 0; + + while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { + union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; + struct nft_pipapo_field *f; + struct nft_pipapo_elem *e; + int i, start, rules_fx; + + start = first_rule; + rules_fx = rules_f0; + + nft_pipapo_for_each_field(f, i, m) { + rulemap[i].to = start; + rulemap[i].n = rules_fx; + + if (i < m->field_count - 1) { + rules_fx = f->mt[start].n; + start = f->mt[start].to; + } + } + + /* Pick the last field, and its last index */ + f--; + i--; + e = f->mt[rulemap[i].to].e; + if (nft_set_elem_expired(&e->ext) && + !nft_set_elem_mark_busy(&e->ext)) { + priv->dirty = true; + pipapo_drop(m, rulemap); + + rcu_barrier(); + nft_set_elem_destroy(set, e, true); + + /* And check again current first rule, which is now the + * first we haven't checked. + */ + } else { + first_rule += rules_f0; + } + } + + priv->last_gc = jiffies; +} + +/** + * pipapo_free_fields() - Free per-field tables contained in matching data + * @m: Matching data + */ +static void pipapo_free_fields(struct nft_pipapo_match *m) +{ + struct nft_pipapo_field *f; + int i; + + nft_pipapo_for_each_field(f, i, m) { + kvfree(f->lt); + kvfree(f->mt); + } +} + +/** + * pipapo_reclaim_match - RCU callback to free fields from old matching data + * @rcu: RCU head + */ +static void pipapo_reclaim_match(struct rcu_head *rcu) +{ + struct nft_pipapo_match *m; + int i; + + m = container_of(rcu, struct nft_pipapo_match, rcu); + + for_each_possible_cpu(i) + kfree(*per_cpu_ptr(m->scratch, i)); + + free_percpu(m->scratch); + + pipapo_free_fields(m); + + kfree(m); +} + +/** + * pipapo_commit() - Replace lookup data with current working copy + * @set: nftables API set representation + * + * While at it, check if we should perform garbage collection on the working + * copy before committing it for lookup, and don't replace the table if the + * working copy doesn't have pending changes. + * + * We also need to create a new working copy for subsequent insertions and + * deletions. + */ +static void pipapo_commit(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *new_clone, *old; + + if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) + pipapo_gc(set, priv->clone); + + if (!priv->dirty) + return; + + new_clone = pipapo_clone(priv->clone); + if (IS_ERR(new_clone)) + return; + + priv->dirty = false; + + old = rcu_access_pointer(priv->match); + rcu_assign_pointer(priv->match, priv->clone); + if (old) + call_rcu(&old->rcu, pipapo_reclaim_match); + + priv->clone = new_clone; +} + +/** + * nft_pipapo_activate() - Mark element reference as active given key, commit + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * On insertion, elements are added to a copy of the matching data currently + * in use for lookups, and not directly inserted into current lookup data, so + * we'll take care of that by calling pipapo_commit() here. Both + * nft_pipapo_insert() and nft_pipapo_activate() are called once for each + * element, hence we can't purpose either one as a real commit operation. + */ +static void nft_pipapo_activate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_pipapo_elem *e; + + e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0); + if (IS_ERR(e)) + return; + + nft_set_elem_change_active(net, set, &e->ext); + nft_set_elem_clear_busy(&e->ext); + + pipapo_commit(set); +} + +/** + * pipapo_deactivate() - Check that element is in set, mark as inactive + * @net: Network namespace + * @set: nftables API set representation + * @data: Input key data + * @ext: nftables API extension pointer, used to check for end element + * + * This is a convenience function that can be called from both + * nft_pipapo_deactivate() and nft_pipapo_flush(), as they are in fact the same + * operation. + * + * Return: deactivated element if found, NULL otherwise. + */ +static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, + const u8 *data, const struct nft_set_ext *ext) +{ + struct nft_pipapo_elem *e; + + e = pipapo_get(net, set, data, nft_genmask_next(net)); + if (IS_ERR(e)) + return NULL; + + nft_set_elem_change_active(net, set, &e->ext); + + return e; +} + +/** + * nft_pipapo_deactivate() - Call pipapo_deactivate() to make element inactive + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * Return: deactivated element if found, NULL otherwise. + */ +static void *nft_pipapo_deactivate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + + return pipapo_deactivate(net, set, (const u8 *)elem->key.val.data, ext); +} + +/** + * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * This is functionally the same as nft_pipapo_deactivate(), with a slightly + * different interface, and it's also called once for each element in a set + * being flushed, so we can't implement, strictly speaking, a flush operation, + * which would otherwise be as simple as allocating an empty copy of the + * matching data. + * + * Note that we could in theory do that, mark the set as flushed, and ignore + * subsequent calls, but we would leak all the elements after the first one, + * because they wouldn't then be freed as result of API calls. + * + * Return: true if element was found and deactivated. + */ +static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set, + void *elem) +{ + struct nft_pipapo_elem *e = elem; + + return pipapo_deactivate(net, set, (const u8 *)nft_set_ext_key(&e->ext), + &e->ext); +} + +/** + * pipapo_get_boundaries() - Get byte interval for associated rules + * @f: Field including lookup table + * @first_rule: First rule (lowest index) + * @rule_count: Number of associated rules + * @left: Byte expression for left boundary (start of range) + * @right: Byte expression for right boundary (end of range) + * + * Given the first rule and amount of rules that originated from the same entry, + * build the original range associated with the entry, and calculate the length + * of the originating netmask. + * + * In pictures: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 1,2 + * 1 1,2 + * 2 1,2 + * 3 1,2 + * 4 1,2 + * 5 1 2 + * 6 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * + * this is the lookup table corresponding to the IPv4 range + * 192.168.1.0-192.168.2.1, which was expanded to the two composing netmasks, + * rule #1: 192.168.1.0/24, and rule #2: 192.168.2.0/31. + * + * This function fills @left and @right with the byte values of the leftmost + * and rightmost bucket indices for the lowest and highest rule indices, + * respectively. If @first_rule is 1 and @rule_count is 2, we obtain, in + * nibbles: + * left: < 12, 0, 10, 8, 0, 1, 0, 0 > + * right: < 12, 0, 10, 8, 0, 2, 2, 1 > + * corresponding to bytes: + * left: < 192, 168, 1, 0 > + * right: < 192, 168, 2, 1 > + * with mask length irrelevant here, unused on return, as the range is already + * defined by its start and end points. The mask length is relevant for a single + * ranged entry instead: if @first_rule is 1 and @rule_count is 1, we ignore + * rule 2 above: @left becomes < 192, 168, 1, 0 >, @right becomes + * < 192, 168, 1, 255 >, and the mask length, calculated from the distances + * between leftmost and rightmost bucket indices for each group, would be 24. + * + * Return: mask length, in bits. + */ +static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule, + int rule_count, u8 *left, u8 *right) +{ + u8 *l = left, *r = right; + int g, mask_len = 0; + + for (g = 0; g < f->groups; g++) { + int b, x0, x1; + + x0 = -1; + x1 = -1; + for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) { + unsigned long *pos; + + pos = f->lt + (g * NFT_PIPAPO_BUCKETS + b) * f->bsize; + if (test_bit(first_rule, pos) && x0 == -1) + x0 = b; + if (test_bit(first_rule + rule_count - 1, pos)) + x1 = b; + } + + if (g % 2) { + *(l++) |= x0 & 0x0f; + *(r++) |= x1 & 0x0f; + } else { + *l |= x0 << 4; + *r |= x1 << 4; + } + + if (x1 - x0 == 0) + mask_len += 4; + else if (x1 - x0 == 1) + mask_len += 3; + else if (x1 - x0 == 3) + mask_len += 2; + else if (x1 - x0 == 7) + mask_len += 1; + } + + return mask_len; +} + +/** + * pipapo_match_field() - Match rules against byte ranges + * @f: Field including the lookup table + * @first_rule: First of associated rules originating from same entry + * @rule_count: Amount of associated rules + * @start: Start of range to be matched + * @end: End of range to be matched + * + * Return: true on match, false otherwise. + */ +static bool pipapo_match_field(struct nft_pipapo_field *f, + int first_rule, int rule_count, + const u8 *start, const u8 *end) +{ + u8 right[NFT_PIPAPO_MAX_BYTES] = { 0 }; + u8 left[NFT_PIPAPO_MAX_BYTES] = { 0 }; + + pipapo_get_boundaries(f, first_rule, rule_count, left, right); + + return !memcmp(start, left, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) && + !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE); +} + +/** + * nft_pipapo_remove() - Remove element given key, commit + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * Similarly to nft_pipapo_activate(), this is used as commit operation by the + * API, but it's called once per element in the pending transaction, so we can't + * implement this as a single commit operation. Closest we can get is to remove + * the matched element here, if any, and commit the updated matching data. + */ +static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) +{ + const u8 *data = (const u8 *)elem->key.val.data; + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = priv->clone; + int rules_f0, first_rule = 0; + struct nft_pipapo_elem *e; + + e = pipapo_get(net, set, data, 0); + if (IS_ERR(e)) + return; + + while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { + union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; + const u8 *match_start, *match_end; + struct nft_pipapo_field *f; + int i, start, rules_fx; + + match_start = data; + match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data; + + start = first_rule; + rules_fx = rules_f0; + + nft_pipapo_for_each_field(f, i, m) { + if (!pipapo_match_field(f, start, rules_fx, + match_start, match_end)) + break; + + rulemap[i].to = start; + rulemap[i].n = rules_fx; + + rules_fx = f->mt[start].n; + start = f->mt[start].to; + + match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + } + + if (i == m->field_count) { + priv->dirty = true; + pipapo_drop(m, rulemap); + pipapo_commit(set); + return; + } + + first_rule += rules_f0; + } +} + +/** + * nft_pipapo_walk() - Walk over elements + * @ctx: nftables API context + * @set: nftables API set representation + * @iter: Iterator + * + * As elements are referenced in the mapping array for the last field, directly + * scan that array: there's no need to follow rule mappings from the first + * field. + */ +static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + int i, r; + + rcu_read_lock(); + m = rcu_dereference(priv->match); + + if (unlikely(!m)) + goto out; + + for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) + ; + + for (r = 0; r < f->rules; r++) { + struct nft_pipapo_elem *e; + struct nft_set_elem elem; + + if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) + continue; + + if (iter->count < iter->skip) + goto cont; + + e = f->mt[r].e; + if (nft_set_elem_expired(&e->ext)) + goto cont; + + elem.priv = e; + + iter->err = iter->fn(ctx, set, iter, &elem); + if (iter->err < 0) + goto out; + +cont: + iter->count++; + } + +out: + rcu_read_unlock(); +} + +/** + * nft_pipapo_privsize() - Return the size of private data for the set + * @nla: netlink attributes, ignored as size doesn't depend on them + * @desc: Set description, ignored as size doesn't depend on it + * + * Return: size of private data for this set implementation, in bytes + */ +static u64 nft_pipapo_privsize(const struct nlattr * const nla[], + const struct nft_set_desc *desc) +{ + return sizeof(struct nft_pipapo); +} + +/** + * nft_pipapo_estimate() - Estimate set size, space and lookup complexity + * @desc: Set description, element count and field description used here + * @features: Flags: NFT_SET_INTERVAL needs to be there + * @est: Storage for estimation data + * + * The size for this set type can vary dramatically, as it depends on the number + * of rules (composing netmasks) the entries expand to. We compute the worst + * case here. + * + * In general, for a non-ranged entry or a single composing netmask, we need + * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that + * is, each input bit needs four bits of matching data), plus a bucket in the + * mapping table for each field. + * + * Return: true only for compatible range concatenations + */ +static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + unsigned long entry_size; + int i; + + if (!(features & NFT_SET_INTERVAL) || desc->field_count <= 1) + return false; + + for (i = 0, entry_size = 0; i < desc->field_count; i++) { + unsigned long rules; + + if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES) + return false; + + /* Worst-case ranges for each concatenated field: each n-bit + * field can expand to up to n * 2 rules in each bucket, and + * each rule also needs a mapping bucket. + */ + rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2; + entry_size += rules * NFT_PIPAPO_BUCKETS / BITS_PER_BYTE; + entry_size += rules * sizeof(union nft_pipapo_map_bucket); + } + + /* Rules in lookup and mapping tables are needed for each entry */ + est->size = desc->size * entry_size; + if (est->size && div_u64(est->size, desc->size) != entry_size) + return false; + + est->size += sizeof(struct nft_pipapo) + + sizeof(struct nft_pipapo_match) * 2; + + est->size += sizeof(struct nft_pipapo_field) * desc->field_count; + + est->lookup = NFT_SET_CLASS_O_LOG_N; + + est->space = NFT_SET_CLASS_O_N; + + return true; +} + +/** + * nft_pipapo_init() - Initialise data for a set instance + * @set: nftables API set representation + * @desc: Set description + * @nla: netlink attributes + * + * Validate number and size of fields passed as NFTA_SET_DESC_CONCAT netlink + * attributes, initialise internal set parameters, current instance of matching + * data and a copy for subsequent insertions. + * + * Return: 0 on success, negative error code on failure. + */ +static int nft_pipapo_init(const struct nft_set *set, + const struct nft_set_desc *desc, + const struct nlattr * const nla[]) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + int err, i; + + if (desc->field_count > NFT_PIPAPO_MAX_FIELDS) + return -EINVAL; + + m = kmalloc(sizeof(*priv->match) + sizeof(*f) * desc->field_count, + GFP_KERNEL); + if (!m) + return -ENOMEM; + + m->field_count = desc->field_count; + m->bsize_max = 0; + + m->scratch = alloc_percpu(unsigned long *); + if (!m->scratch) { + err = -ENOMEM; + goto out_free; + } + for_each_possible_cpu(i) + *per_cpu_ptr(m->scratch, i) = NULL; + + rcu_head_init(&m->rcu); + + nft_pipapo_for_each_field(f, i, m) { + f->groups = desc->field_len[i] * NFT_PIPAPO_GROUPS_PER_BYTE; + priv->groups += f->groups; + + priv->width += round_up(desc->field_len[i], sizeof(u32)); + + f->bsize = 0; + f->rules = 0; + f->lt = NULL; + f->mt = NULL; + } + + /* Create an initial clone of matching data for next insertion */ + priv->clone = pipapo_clone(m); + if (IS_ERR(priv->clone)) { + err = PTR_ERR(priv->clone); + goto out_free; + } + + priv->dirty = false; + + rcu_assign_pointer(priv->match, m); + + return 0; + +out_free: + free_percpu(m->scratch); + kfree(m); + + return err; +} + +/** + * nft_pipapo_destroy() - Free private data for set and all committed elements + * @set: nftables API set representation + */ +static void nft_pipapo_destroy(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + int i, r, cpu; + + m = rcu_dereference_protected(priv->match, true); + if (m) { + rcu_barrier(); + + for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) + ; + + for (r = 0; r < f->rules; r++) { + struct nft_pipapo_elem *e; + + if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) + continue; + + e = f->mt[r].e; + + nft_set_elem_destroy(set, e, true); + } + + for_each_possible_cpu(cpu) + kfree(*per_cpu_ptr(m->scratch, cpu)); + free_percpu(m->scratch); + + pipapo_free_fields(m); + kfree(m); + priv->match = NULL; + } + + if (priv->clone) { + for_each_possible_cpu(cpu) + kfree(*per_cpu_ptr(priv->clone->scratch, cpu)); + free_percpu(priv->clone->scratch); + + pipapo_free_fields(priv->clone); + kfree(priv->clone); + priv->clone = NULL; + } +} + +/** + * nft_pipapo_gc_init() - Initialise garbage collection + * @set: nftables API set representation + * + * Instead of actually setting up a periodic work for garbage collection, as + * this operation requires a swap of matching data with the working copy, we'll + * do that opportunistically with other commit operations if the interval is + * elapsed, so we just need to set the current jiffies timestamp here. + */ +static void nft_pipapo_gc_init(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + + priv->last_gc = jiffies; +} + +struct nft_set_type nft_set_pipapo_type __read_mostly = { + .owner = THIS_MODULE, + .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | + NFT_SET_TIMEOUT, + .ops = { + .lookup = nft_pipapo_lookup, + .insert = nft_pipapo_insert, + .activate = nft_pipapo_activate, + .deactivate = nft_pipapo_deactivate, + .flush = nft_pipapo_flush, + .remove = nft_pipapo_remove, + .walk = nft_pipapo_walk, + .get = nft_pipapo_get, + .privsize = nft_pipapo_privsize, + .estimate = nft_pipapo_estimate, + .init = nft_pipapo_init, + .destroy = nft_pipapo_destroy, + .gc_init = nft_pipapo_gc_init, + .elemsize = offsetof(struct nft_pipapo_elem, ext), + }, +}; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 419d58ef802b..5000b938ab1e 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -13,7 +13,7 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> struct nft_rbtree { struct rb_root root; @@ -74,8 +74,13 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set parent = rcu_dereference_raw(parent->rb_left); continue; } - if (nft_rbtree_interval_end(rbe)) - goto out; + if (nft_rbtree_interval_end(rbe)) { + if (nft_set_is_anonymous(set)) + return false; + parent = rcu_dereference_raw(parent->rb_left); + interval = NULL; + continue; + } *ext = &rbe->ext; return true; @@ -88,7 +93,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set *ext = &interval->ext; return true; } -out: + return false; } @@ -139,8 +144,10 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, if (flags & NFT_SET_ELEM_INTERVAL_END) interval = rbe; } else { - if (!nft_set_elem_active(&rbe->ext, genmask)) + if (!nft_set_elem_active(&rbe->ext, genmask)) { parent = rcu_dereference_raw(parent->rb_left); + continue; + } if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) || (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) == @@ -148,7 +155,11 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, *elem = rbe; return true; } - return false; + + if (nft_rbtree_interval_end(rbe)) + interval = NULL; + + parent = rcu_dereference_raw(parent->rb_left); } } @@ -455,6 +466,9 @@ static void nft_rbtree_destroy(const struct nft_set *set) static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { + if (desc->field_count > 1) + return false; + if (desc->size) est->size = sizeof(struct nft_rbtree) + desc->size * sizeof(struct nft_rbtree_elem); diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index d7f3776dfd71..637ce3e8c575 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -47,9 +47,6 @@ static void nft_socket_eval(const struct nft_expr *expr, return; } - /* So that subsequent socket matching not to require other lookups. */ - skb->sk = sk; - switch(priv->key) { case NFT_SOCKET_TRANSPARENT: nft_reg_store8(dest, inet_sk_transparent(sk)); @@ -66,6 +63,9 @@ static void nft_socket_eval(const struct nft_expr *expr, WARN_ON(1); regs->verdict.code = NFT_BREAK; } + + if (sk != skb->sk) + sock_gen_put(sk); } static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = { diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 928e661d1517..e2c1fc608841 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -24,15 +24,15 @@ static void nft_synproxy_tcp_options(struct synproxy_options *opts, const struct tcphdr *tcp, struct synproxy_net *snet, struct nf_synproxy_info *info, - struct nft_synproxy *priv) + const struct nft_synproxy *priv) { this_cpu_inc(snet->stats->syn_received); if (tcp->ece && tcp->cwr) opts->options |= NF_SYNPROXY_OPT_ECN; opts->options &= priv->info.options; - opts->mss_encode = opts->mss; - opts->mss = info->mss; + opts->mss_encode = opts->mss_option; + opts->mss_option = info->mss; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, opts); else @@ -41,14 +41,13 @@ static void nft_synproxy_tcp_options(struct synproxy_options *opts, NF_SYNPROXY_OPT_ECN); } -static void nft_synproxy_eval_v4(const struct nft_expr *expr, +static void nft_synproxy_eval_v4(const struct nft_synproxy *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt, const struct tcphdr *tcp, struct tcphdr *_tcph, struct synproxy_options *opts) { - struct nft_synproxy *priv = nft_expr_priv(expr); struct nf_synproxy_info info = priv->info; struct net *net = nft_net(pkt); struct synproxy_net *snet = synproxy_pernet(net); @@ -73,14 +72,13 @@ static void nft_synproxy_eval_v4(const struct nft_expr *expr, } #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) -static void nft_synproxy_eval_v6(const struct nft_expr *expr, +static void nft_synproxy_eval_v6(const struct nft_synproxy *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt, const struct tcphdr *tcp, struct tcphdr *_tcph, struct synproxy_options *opts) { - struct nft_synproxy *priv = nft_expr_priv(expr); struct nf_synproxy_info info = priv->info; struct net *net = nft_net(pkt); struct synproxy_net *snet = synproxy_pernet(net); @@ -105,9 +103,9 @@ static void nft_synproxy_eval_v6(const struct nft_expr *expr, } #endif /* CONFIG_NF_TABLES_IPV6*/ -static void nft_synproxy_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static void nft_synproxy_do_eval(const struct nft_synproxy *priv, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct synproxy_options opts = {}; struct sk_buff *skb = pkt->skb; @@ -140,23 +138,22 @@ static void nft_synproxy_eval(const struct nft_expr *expr, switch (skb->protocol) { case htons(ETH_P_IP): - nft_synproxy_eval_v4(expr, regs, pkt, tcp, &_tcph, &opts); + nft_synproxy_eval_v4(priv, regs, pkt, tcp, &_tcph, &opts); return; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case htons(ETH_P_IPV6): - nft_synproxy_eval_v6(expr, regs, pkt, tcp, &_tcph, &opts); + nft_synproxy_eval_v6(priv, regs, pkt, tcp, &_tcph, &opts); return; #endif } regs->verdict.code = NFT_BREAK; } -static int nft_synproxy_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_synproxy_do_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_synproxy *priv) { struct synproxy_net *snet = synproxy_pernet(ctx->net); - struct nft_synproxy *priv = nft_expr_priv(expr); u32 flags; int err; @@ -206,8 +203,7 @@ nf_ct_failure: return err; } -static void nft_synproxy_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) +static void nft_synproxy_do_destroy(const struct nft_ctx *ctx) { struct synproxy_net *snet = synproxy_pernet(ctx->net); @@ -229,10 +225,8 @@ static void nft_synproxy_destroy(const struct nft_ctx *ctx, nf_ct_netns_put(ctx->net, ctx->family); } -static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_synproxy_do_dump(struct sk_buff *skb, struct nft_synproxy *priv) { - const struct nft_synproxy *priv = nft_expr_priv(expr); - if (nla_put_be16(skb, NFTA_SYNPROXY_MSS, htons(priv->info.mss)) || nla_put_u8(skb, NFTA_SYNPROXY_WSCALE, priv->info.wscale) || nla_put_be32(skb, NFTA_SYNPROXY_FLAGS, htonl(priv->info.options))) @@ -244,6 +238,15 @@ nla_put_failure: return -1; } +static void nft_synproxy_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_synproxy *priv = nft_expr_priv(expr); + + nft_synproxy_do_eval(priv, regs, pkt); +} + static int nft_synproxy_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) @@ -252,6 +255,28 @@ static int nft_synproxy_validate(const struct nft_ctx *ctx, (1 << NF_INET_FORWARD)); } +static int nft_synproxy_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_synproxy *priv = nft_expr_priv(expr); + + return nft_synproxy_do_init(ctx, tb, priv); +} + +static void nft_synproxy_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + nft_synproxy_do_destroy(ctx); +} + +static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_synproxy *priv = nft_expr_priv(expr); + + return nft_synproxy_do_dump(skb, priv); +} + static struct nft_expr_type nft_synproxy_type; static const struct nft_expr_ops nft_synproxy_ops = { .eval = nft_synproxy_eval, @@ -271,14 +296,89 @@ static struct nft_expr_type nft_synproxy_type __read_mostly = { .maxattr = NFTA_SYNPROXY_MAX, }; +static int nft_synproxy_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_synproxy *priv = nft_obj_data(obj); + + return nft_synproxy_do_init(ctx, tb, priv); +} + +static void nft_synproxy_obj_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + nft_synproxy_do_destroy(ctx); +} + +static int nft_synproxy_obj_dump(struct sk_buff *skb, + struct nft_object *obj, bool reset) +{ + struct nft_synproxy *priv = nft_obj_data(obj); + + return nft_synproxy_do_dump(skb, priv); +} + +static void nft_synproxy_obj_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_synproxy *priv = nft_obj_data(obj); + + nft_synproxy_do_eval(priv, regs, pkt); +} + +static void nft_synproxy_obj_update(struct nft_object *obj, + struct nft_object *newobj) +{ + struct nft_synproxy *newpriv = nft_obj_data(newobj); + struct nft_synproxy *priv = nft_obj_data(obj); + + priv->info = newpriv->info; +} + +static struct nft_object_type nft_synproxy_obj_type; +static const struct nft_object_ops nft_synproxy_obj_ops = { + .type = &nft_synproxy_obj_type, + .size = sizeof(struct nft_synproxy), + .init = nft_synproxy_obj_init, + .destroy = nft_synproxy_obj_destroy, + .dump = nft_synproxy_obj_dump, + .eval = nft_synproxy_obj_eval, + .update = nft_synproxy_obj_update, +}; + +static struct nft_object_type nft_synproxy_obj_type __read_mostly = { + .type = NFT_OBJECT_SYNPROXY, + .ops = &nft_synproxy_obj_ops, + .maxattr = NFTA_SYNPROXY_MAX, + .policy = nft_synproxy_policy, + .owner = THIS_MODULE, +}; + static int __init nft_synproxy_module_init(void) { - return nft_register_expr(&nft_synproxy_type); + int err; + + err = nft_register_obj(&nft_synproxy_obj_type); + if (err < 0) + return err; + + err = nft_register_expr(&nft_synproxy_type); + if (err < 0) + goto err; + + return 0; + +err: + nft_unregister_obj(&nft_synproxy_obj_type); + return err; } static void __exit nft_synproxy_module_exit(void) { - return nft_unregister_expr(&nft_synproxy_type); + nft_unregister_expr(&nft_synproxy_type); + nft_unregister_obj(&nft_synproxy_obj_type); } module_init(nft_synproxy_module_init); @@ -287,3 +387,4 @@ module_exit(nft_synproxy_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>"); MODULE_ALIAS_NFT_EXPR("synproxy"); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_SYNPROXY); diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index f92a82c73880..d67f83a0958d 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -50,7 +50,7 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr, taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr); if (priv->sreg_port) - tport = regs->data[priv->sreg_port]; + tport = nft_reg_load16(®s->data[priv->sreg_port]); if (!tport) tport = hp->dest; @@ -117,7 +117,7 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr, taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr); if (priv->sreg_port) - tport = regs->data[priv->sreg_port]; + tport = nft_reg_load16(®s->data[priv->sreg_port]); if (!tport) tport = hp->dest; @@ -218,14 +218,14 @@ static int nft_tproxy_init(const struct nft_ctx *ctx, switch (priv->family) { case NFPROTO_IPV4: - alen = FIELD_SIZEOF(union nf_inet_addr, in); + alen = sizeof_field(union nf_inet_addr, in); err = nf_defrag_ipv4_enable(ctx->net); if (err) return err; break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: - alen = FIELD_SIZEOF(union nf_inet_addr, in6); + alen = sizeof_field(union nf_inet_addr, in6); err = nf_defrag_ipv6_enable(ctx->net); if (err) return err; diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 3d4c2ae605a8..4c3f2e24c7cb 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -76,7 +76,7 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx, struct nft_tunnel *priv = nft_expr_priv(expr); u32 len; - if (!tb[NFTA_TUNNEL_KEY] && + if (!tb[NFTA_TUNNEL_KEY] || !tb[NFTA_TUNNEL_DREG]) return -EINVAL; @@ -248,8 +248,9 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr, } static const struct nla_policy nft_tunnel_opts_erspan_policy[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1] = { + [NFTA_TUNNEL_KEY_ERSPAN_VERSION] = { .type = NLA_U32 }, [NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX] = { .type = NLA_U32 }, - [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 }, + [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 }, [NFTA_TUNNEL_KEY_ERSPAN_V2_HWID] = { .type = NLA_U8 }, }; @@ -266,6 +267,9 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, if (err < 0) return err; + if (!tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION]) + return -EINVAL; + version = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION])); switch (version) { case ERSPAN_VERSION: @@ -442,10 +446,15 @@ static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info) if (!nest) return -1; - if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC, &info->key.u.ipv6.src) < 0 || - nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST, &info->key.u.ipv6.dst) < 0 || - nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL, info->key.label)) + if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC, + &info->key.u.ipv6.src) < 0 || + nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST, + &info->key.u.ipv6.dst) < 0 || + nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL, + info->key.label)) { + nla_nest_cancel(skb, nest); return -1; + } nla_nest_end(skb, nest); } else { @@ -453,9 +462,13 @@ static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info) if (!nest) return -1; - if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC, info->key.u.ipv4.src) < 0 || - nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST, info->key.u.ipv4.dst) < 0) + if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC, + info->key.u.ipv4.src) < 0 || + nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST, + info->key.u.ipv4.dst) < 0) { + nla_nest_cancel(skb, nest); return -1; + } nla_nest_end(skb, nest); } @@ -467,42 +480,58 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, struct nft_tunnel_obj *priv) { struct nft_tunnel_opts *opts = &priv->opts; - struct nlattr *nest; + struct nlattr *nest, *inner; nest = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS); if (!nest) return -1; if (opts->flags & TUNNEL_VXLAN_OPT) { + inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_VXLAN); + if (!inner) + goto failure; if (nla_put_be32(skb, NFTA_TUNNEL_KEY_VXLAN_GBP, htonl(opts->u.vxlan.gbp))) - return -1; + goto inner_failure; + nla_nest_end(skb, inner); } else if (opts->flags & TUNNEL_ERSPAN_OPT) { + inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_ERSPAN); + if (!inner) + goto failure; + if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_VERSION, + htonl(opts->u.erspan.version))) + goto inner_failure; switch (opts->u.erspan.version) { case ERSPAN_VERSION: if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX, opts->u.erspan.u.index)) - return -1; + goto inner_failure; break; case ERSPAN_VERSION2: if (nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_HWID, get_hwid(&opts->u.erspan.u.md2)) || nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_DIR, opts->u.erspan.u.md2.dir)) - return -1; + goto inner_failure; break; } + nla_nest_end(skb, inner); } nla_nest_end(skb, nest); - return 0; + +inner_failure: + nla_nest_cancel(skb, inner); +failure: + nla_nest_cancel(skb, nest); + return -1; } static int nft_tunnel_ports_dump(struct sk_buff *skb, struct ip_tunnel_info *info) { - if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, htons(info->key.tp_src)) < 0 || - nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, htons(info->key.tp_dst)) < 0) + if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, info->key.tp_src) < 0 || + nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, info->key.tp_dst) < 0) return -1; return 0; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index ce70c2576bb2..e27c6c5ba9df 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -939,14 +939,14 @@ EXPORT_SYMBOL(xt_check_entry_offsets); * * @size: number of entries * - * Return: NULL or kmalloc'd or vmalloc'd array + * Return: NULL or zeroed kmalloc'd or vmalloc'd array */ unsigned int *xt_alloc_entry_offsets(unsigned int size) { if (size > XT_MAX_TABLE_SIZE / sizeof(unsigned int)) return NULL; - return kvmalloc_array(size, sizeof(unsigned int), GFP_KERNEL | __GFP_ZERO); + return kvcalloc(size, sizeof(unsigned int), GFP_KERNEL); } EXPORT_SYMBOL(xt_alloc_entry_offsets); diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c index be7798a50546..713fb38541df 100644 --- a/net/netfilter/xt_HMARK.c +++ b/net/netfilter/xt_HMARK.c @@ -239,11 +239,7 @@ static int get_inner_hdr(const struct sk_buff *skb, int iphsz, int *nhoff) return 0; /* Error message? */ - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_SOURCE_QUENCH && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB && - icmph->type != ICMP_REDIRECT) + if (!icmp_is_err(icmph->type)) return 0; *nhoff += iphsz + sizeof(_ih); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 9cec9eae556a..f56d3ed93e56 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -283,7 +283,7 @@ static int __init idletimer_tg_init(void) idletimer_tg_kobj = &idletimer_tg_device->kobj; - err = xt_register_target(&idletimer_tg); + err = xt_register_target(&idletimer_tg); if (err < 0) { pr_debug("couldn't register xt target\n"); goto out_dev; diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 2236455b10a3..37253d399c6b 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -30,7 +30,7 @@ static unsigned int jhash_rnd __read_mostly; static unsigned int xt_rateest_hash(const char *name) { - return jhash(name, FIELD_SIZEOF(struct xt_rateest, name), jhash_rnd) & + return jhash(name, sizeof_field(struct xt_rateest, name), jhash_rnd) & (RATEEST_HSIZE - 1); } diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index bc6c8ab0fa62..46fcac75f726 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -13,6 +13,8 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/ip.h> +#include <linux/ipv6.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 2d2691dd51e0..bccd47cd7190 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -34,9 +34,14 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> -#include <linux/netfilter/xt_hashlimit.h> #include <linux/mutex.h> #include <linux/kernel.h> +#include <uapi/linux/netfilter/xt_hashlimit.h> + +#define XT_HASHLIMIT_ALL (XT_HASHLIMIT_HASH_DIP | XT_HASHLIMIT_HASH_DPT | \ + XT_HASHLIMIT_HASH_SIP | XT_HASHLIMIT_HASH_SPT | \ + XT_HASHLIMIT_INVERT | XT_HASHLIMIT_BYTES |\ + XT_HASHLIMIT_RATE_MATCH) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); @@ -352,21 +357,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, return 0; } -static bool select_all(const struct xt_hashlimit_htable *ht, - const struct dsthash_ent *he) -{ - return true; -} - -static bool select_gc(const struct xt_hashlimit_htable *ht, - const struct dsthash_ent *he) -{ - return time_after_eq(jiffies, he->expires); -} - -static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, - bool (*select)(const struct xt_hashlimit_htable *ht, - const struct dsthash_ent *he)) +static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, bool select_all) { unsigned int i; @@ -376,7 +367,7 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, spin_lock_bh(&ht->lock); hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) { - if ((*select)(ht, dh)) + if (time_after_eq(jiffies, dh->expires) || select_all) dsthash_free(ht, dh); } spin_unlock_bh(&ht->lock); @@ -390,7 +381,7 @@ static void htable_gc(struct work_struct *work) ht = container_of(work, struct xt_hashlimit_htable, gc_work.work); - htable_selective_cleanup(ht, select_gc); + htable_selective_cleanup(ht, false); queue_delayed_work(system_power_efficient_wq, &ht->gc_work, msecs_to_jiffies(ht->cfg.gc_interval)); @@ -414,7 +405,7 @@ static void htable_destroy(struct xt_hashlimit_htable *hinfo) { cancel_delayed_work_sync(&hinfo->gc_work); htable_remove_proc_entry(hinfo); - htable_selective_cleanup(hinfo, select_all); + htable_selective_cleanup(hinfo, true); kfree(hinfo->name); vfree(hinfo); } diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index d0ab1adf5bff..5aab6df74e0f 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -54,25 +54,39 @@ nfacct_mt_destroy(const struct xt_mtdtor_param *par) nfnl_acct_put(info->nfacct); } -static struct xt_match nfacct_mt_reg __read_mostly = { - .name = "nfacct", - .family = NFPROTO_UNSPEC, - .checkentry = nfacct_mt_checkentry, - .match = nfacct_mt, - .destroy = nfacct_mt_destroy, - .matchsize = sizeof(struct xt_nfacct_match_info), - .usersize = offsetof(struct xt_nfacct_match_info, nfacct), - .me = THIS_MODULE, +static struct xt_match nfacct_mt_reg[] __read_mostly = { + { + .name = "nfacct", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = nfacct_mt_checkentry, + .match = nfacct_mt, + .destroy = nfacct_mt_destroy, + .matchsize = sizeof(struct xt_nfacct_match_info), + .usersize = offsetof(struct xt_nfacct_match_info, nfacct), + .me = THIS_MODULE, + }, + { + .name = "nfacct", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = nfacct_mt_checkentry, + .match = nfacct_mt, + .destroy = nfacct_mt_destroy, + .matchsize = sizeof(struct xt_nfacct_match_info_v1), + .usersize = offsetof(struct xt_nfacct_match_info_v1, nfacct), + .me = THIS_MODULE, + }, }; static int __init nfacct_mt_init(void) { - return xt_register_match(&nfacct_mt_reg); + return xt_register_matches(nfacct_mt_reg, ARRAY_SIZE(nfacct_mt_reg)); } static void __exit nfacct_mt_exit(void) { - xt_unregister_match(&nfacct_mt_reg); + xt_unregister_matches(nfacct_mt_reg, ARRAY_SIZE(nfacct_mt_reg)); } module_init(nfacct_mt_init); diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index ead7c6022208..ec6ed6fda96c 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -5,12 +5,13 @@ /* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/if.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter_bridge.h> -#include <linux/netfilter/xt_physdev.h> #include <linux/netfilter/x_tables.h> -#include <net/netfilter/br_netfilter.h> +#include <uapi/linux/netfilter/xt_physdev.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); @@ -101,11 +102,9 @@ static int physdev_mt_check(const struct xt_mtchk_param *par) if (info->bitmask & (XT_PHYSDEV_OP_OUT | XT_PHYSDEV_OP_ISOUT) && (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) || info->invert & XT_PHYSDEV_OP_BRIDGED) && - par->hook_mask & ((1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) { + par->hook_mask & (1 << NF_INET_LOCAL_OUT)) { pr_info_ratelimited("--physdev-out and --physdev-is-out only supported in the FORWARD and POSTROUTING chains with bridged traffic\n"); - if (par->hook_mask & (1 << NF_INET_LOCAL_OUT)) - return -EINVAL; + return -EINVAL; } if (!brnf_probed) { diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 781e0b482189..0a9708004e20 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -103,7 +103,7 @@ static DEFINE_SPINLOCK(recent_lock); static DEFINE_MUTEX(recent_mutex); #ifdef CONFIG_PROC_FS -static const struct file_operations recent_mt_fops; +static const struct proc_ops recent_mt_proc_ops; #endif static u_int32_t hash_rnd __read_mostly; @@ -405,7 +405,7 @@ static int recent_mt_check(const struct xt_mtchk_param *par, goto out; } pde = proc_create_data(t->name, ip_list_perms, recent_net->xt_recent, - &recent_mt_fops, t); + &recent_mt_proc_ops, t); if (pde == NULL) { recent_table_free(t); ret = -ENOMEM; @@ -616,13 +616,12 @@ recent_mt_proc_write(struct file *file, const char __user *input, return size + 1; } -static const struct file_operations recent_mt_fops = { - .open = recent_seq_open, - .read = seq_read, - .write = recent_mt_proc_write, - .release = seq_release_private, - .owner = THIS_MODULE, - .llseek = seq_lseek, +static const struct proc_ops recent_mt_proc_ops = { + .proc_open = recent_seq_open, + .proc_read = seq_read, + .proc_write = recent_mt_proc_write, + .proc_release = seq_release_private, + .proc_lseek = seq_lseek, }; static int __net_init recent_proc_net_init(struct net *net) diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index ecbfa291fb70..731bc2cafae4 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -14,7 +14,6 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <uapi/linux/netfilter/xt_set.h> MODULE_LICENSE("GPL"); diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 8dbb4d48f2ed..67cb98489415 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -77,12 +77,12 @@ static inline bool is_leap(unsigned int y) * This is done in three separate functions so that the most expensive * calculations are done last, in case a "simple match" can be found earlier. */ -static inline unsigned int localtime_1(struct xtm *r, time_t time) +static inline unsigned int localtime_1(struct xtm *r, time64_t time) { unsigned int v, w; /* Each day has 86400s, so finding the hour/minute is actually easy. */ - v = time % SECONDS_PER_DAY; + div_u64_rem(time, SECONDS_PER_DAY, &v); r->second = v % 60; w = v / 60; r->minute = w % 60; @@ -90,13 +90,13 @@ static inline unsigned int localtime_1(struct xtm *r, time_t time) return v; } -static inline void localtime_2(struct xtm *r, time_t time) +static inline void localtime_2(struct xtm *r, time64_t time) { /* * Here comes the rest (weekday, monthday). First, divide the SSTE * by seconds-per-day to get the number of _days_ since the epoch. */ - r->dse = time / 86400; + r->dse = div_u64(time, SECONDS_PER_DAY); /* * 1970-01-01 (w=0) was a Thursday (4). @@ -105,7 +105,7 @@ static inline void localtime_2(struct xtm *r, time_t time) r->weekday = (4 + r->dse - 1) % 7 + 1; } -static void localtime_3(struct xtm *r, time_t time) +static void localtime_3(struct xtm *r, time64_t time) { unsigned int year, i, w = r->dse; @@ -160,7 +160,7 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct xt_time_info *info = par->matchinfo; unsigned int packet_time; struct xtm current_time; - s64 stamp; + time64_t stamp; /* * We need real time here, but we can neither use skb->tstamp @@ -173,14 +173,14 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) * 1. match before 13:00 * 2. match after 13:00 * - * If you match against processing time (get_seconds) it + * If you match against processing time (ktime_get_real_seconds) it * may happen that the same packet matches both rules if * it arrived at the right moment before 13:00, so it would be * better to check skb->tstamp and set it via __net_timestamp() * if needed. This however breaks outgoing packets tx timestamp, * and causes them to get delayed forever by fq packet scheduler. */ - stamp = get_seconds(); + stamp = ktime_get_real_seconds(); if (info->flags & XT_TIME_LOCAL_TZ) /* Adjust for local timezone */ @@ -193,6 +193,9 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) * - 'now' is in the weekday mask * - 'now' is in the daytime range time_start..time_end * (and by default, libxt_time will set these so as to match) + * + * note: info->date_start/stop are unsigned 32-bit values that + * can hold values beyond y2038, but not after y2106. */ if (stamp < info->date_start || stamp > info->date_stop) diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 2b0ef55cf89e..409a3ae47ce2 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -607,7 +607,7 @@ catmap_getnode_alloc: */ int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset) { - struct netlbl_lsm_catmap *iter = catmap; + struct netlbl_lsm_catmap *iter; u32 idx; u32 bit; NETLBL_CATMAP_MAPTYPE bitmap; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 90b2ab9dd449..4e31721e7293 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2755,7 +2755,7 @@ static int __init netlink_proto_init(void) if (err != 0) goto out; - BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb)); + BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb)); nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); if (!nl_table) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index efccd1ac9a66..0522b2b1fd95 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -458,10 +458,63 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, } EXPORT_SYMBOL(genlmsg_put); +static struct genl_dumpit_info *genl_dumpit_info_alloc(void) +{ + return kmalloc(sizeof(struct genl_dumpit_info), GFP_KERNEL); +} + +static void genl_dumpit_info_free(const struct genl_dumpit_info *info) +{ + kfree(info); +} + +static struct nlattr ** +genl_family_rcv_msg_attrs_parse(const struct genl_family *family, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + const struct genl_ops *ops, + int hdrlen, + enum genl_validate_flags no_strict_flag, + bool parallel) +{ + enum netlink_validation validate = ops->validate & no_strict_flag ? + NL_VALIDATE_LIBERAL : + NL_VALIDATE_STRICT; + struct nlattr **attrbuf; + int err; + + if (!family->maxattr) + return NULL; + + if (parallel) { + attrbuf = kmalloc_array(family->maxattr + 1, + sizeof(struct nlattr *), GFP_KERNEL); + if (!attrbuf) + return ERR_PTR(-ENOMEM); + } else { + attrbuf = family->attrbuf; + } + + err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, + family->policy, validate, extack); + if (err && parallel) { + kfree(attrbuf); + return ERR_PTR(err); + } + return attrbuf; +} + +static void genl_family_rcv_msg_attrs_free(const struct genl_family *family, + struct nlattr **attrbuf, + bool parallel) +{ + if (parallel) + kfree(attrbuf); +} + static int genl_lock_start(struct netlink_callback *cb) { - /* our ops are always const - netlink API doesn't propagate that */ - const struct genl_ops *ops = cb->data; + const struct genl_ops *ops = genl_dumpit_info(cb)->ops; int rc = 0; if (ops->start) { @@ -474,8 +527,7 @@ static int genl_lock_start(struct netlink_callback *cb) static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - /* our ops are always const - netlink API doesn't propagate that */ - const struct genl_ops *ops = cb->data; + const struct genl_ops *ops = genl_dumpit_info(cb)->ops; int rc; genl_lock(); @@ -486,8 +538,8 @@ static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) static int genl_lock_done(struct netlink_callback *cb) { - /* our ops are always const - netlink API doesn't propagate that */ - const struct genl_ops *ops = cb->data; + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + const struct genl_ops *ops = info->ops; int rc = 0; if (ops->done) { @@ -495,120 +547,111 @@ static int genl_lock_done(struct netlink_callback *cb) rc = ops->done(cb); genl_unlock(); } + genl_family_rcv_msg_attrs_free(info->family, info->attrs, true); + genl_dumpit_info_free(info); return rc; } -static int genl_family_rcv_msg(const struct genl_family *family, - struct sk_buff *skb, - struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int genl_parallel_done(struct netlink_callback *cb) { - const struct genl_ops *ops; - struct net *net = sock_net(skb->sk); - struct genl_info info; - struct genlmsghdr *hdr = nlmsg_data(nlh); - struct nlattr **attrbuf; - int hdrlen, err; + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + const struct genl_ops *ops = info->ops; + int rc = 0; - /* this family doesn't exist in this netns */ - if (!family->netnsok && !net_eq(net, &init_net)) - return -ENOENT; + if (ops->done) + rc = ops->done(cb); + genl_family_rcv_msg_attrs_free(info->family, info->attrs, true); + genl_dumpit_info_free(info); + return rc; +} - hdrlen = GENL_HDRLEN + family->hdrsize; - if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) - return -EINVAL; +static int genl_family_rcv_msg_dumpit(const struct genl_family *family, + struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + const struct genl_ops *ops, + int hdrlen, struct net *net) +{ + struct genl_dumpit_info *info; + struct nlattr **attrs = NULL; + int err; - ops = genl_get_cmd(hdr->cmd, family); - if (ops == NULL) + if (!ops->dumpit) return -EOPNOTSUPP; - if ((ops->flags & GENL_ADMIN_PERM) && - !netlink_capable(skb, CAP_NET_ADMIN)) - return -EPERM; - - if ((ops->flags & GENL_UNS_ADMIN_PERM) && - !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) { - int rc; - - if (ops->dumpit == NULL) - return -EOPNOTSUPP; - - if (!(ops->validate & GENL_DONT_VALIDATE_DUMP)) { - int hdrlen = GENL_HDRLEN + family->hdrsize; - - if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) - return -EINVAL; + if (ops->validate & GENL_DONT_VALIDATE_DUMP) + goto no_attrs; - if (family->maxattr) { - unsigned int validate = NL_VALIDATE_STRICT; - - if (ops->validate & - GENL_DONT_VALIDATE_DUMP_STRICT) - validate = NL_VALIDATE_LIBERAL; - rc = __nla_validate(nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), - family->maxattr, - family->policy, - validate, extack); - if (rc) - return rc; - } - } + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + return -EINVAL; - if (!family->parallel_ops) { - struct netlink_dump_control c = { - .module = family->module, - /* we have const, but the netlink API doesn't */ - .data = (void *)ops, - .start = genl_lock_start, - .dump = genl_lock_dumpit, - .done = genl_lock_done, - }; + attrs = genl_family_rcv_msg_attrs_parse(family, nlh, extack, + ops, hdrlen, + GENL_DONT_VALIDATE_DUMP_STRICT, + true); + if (IS_ERR(attrs)) + return PTR_ERR(attrs); + +no_attrs: + /* Allocate dumpit info. It is going to be freed by done() callback. */ + info = genl_dumpit_info_alloc(); + if (!info) { + genl_family_rcv_msg_attrs_free(family, attrs, true); + return -ENOMEM; + } - genl_unlock(); - rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c); - genl_lock(); + info->family = family; + info->ops = ops; + info->attrs = attrs; - } else { - struct netlink_dump_control c = { - .module = family->module, - .start = ops->start, - .dump = ops->dumpit, - .done = ops->done, - }; + if (!family->parallel_ops) { + struct netlink_dump_control c = { + .module = family->module, + .data = info, + .start = genl_lock_start, + .dump = genl_lock_dumpit, + .done = genl_lock_done, + }; - rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c); - } + genl_unlock(); + err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); + genl_lock(); - return rc; + } else { + struct netlink_dump_control c = { + .module = family->module, + .data = info, + .start = ops->start, + .dump = ops->dumpit, + .done = genl_parallel_done, + }; + + err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); } - if (ops->doit == NULL) - return -EOPNOTSUPP; - - if (family->maxattr && family->parallel_ops) { - attrbuf = kmalloc_array(family->maxattr + 1, - sizeof(struct nlattr *), - GFP_KERNEL); - if (attrbuf == NULL) - return -ENOMEM; - } else - attrbuf = family->attrbuf; + return err; +} - if (attrbuf) { - enum netlink_validation validate = NL_VALIDATE_STRICT; +static int genl_family_rcv_msg_doit(const struct genl_family *family, + struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + const struct genl_ops *ops, + int hdrlen, struct net *net) +{ + struct nlattr **attrbuf; + struct genl_info info; + int err; - if (ops->validate & GENL_DONT_VALIDATE_STRICT) - validate = NL_VALIDATE_LIBERAL; + if (!ops->doit) + return -EOPNOTSUPP; - err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, - family->policy, validate, extack); - if (err < 0) - goto out; - } + attrbuf = genl_family_rcv_msg_attrs_parse(family, nlh, extack, + ops, hdrlen, + GENL_DONT_VALIDATE_STRICT, + family->parallel_ops); + if (IS_ERR(attrbuf)) + return PTR_ERR(attrbuf); info.snd_seq = nlh->nlmsg_seq; info.snd_portid = NETLINK_CB(skb).portid; @@ -632,12 +675,49 @@ static int genl_family_rcv_msg(const struct genl_family *family, family->post_doit(ops, skb, &info); out: - if (family->parallel_ops) - kfree(attrbuf); + genl_family_rcv_msg_attrs_free(family, attrbuf, family->parallel_ops); return err; } +static int genl_family_rcv_msg(const struct genl_family *family, + struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + const struct genl_ops *ops; + struct net *net = sock_net(skb->sk); + struct genlmsghdr *hdr = nlmsg_data(nlh); + int hdrlen; + + /* this family doesn't exist in this netns */ + if (!family->netnsok && !net_eq(net, &init_net)) + return -ENOENT; + + hdrlen = GENL_HDRLEN + family->hdrsize; + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + return -EINVAL; + + ops = genl_get_cmd(hdr->cmd, family); + if (ops == NULL) + return -EOPNOTSUPP; + + if ((ops->flags & GENL_ADMIN_PERM) && + !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + if ((ops->flags & GENL_UNS_ADMIN_PERM) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) + return genl_family_rcv_msg_dumpit(family, skb, nlh, extack, + ops, hdrlen, net); + else + return genl_family_rcv_msg_doit(family, skb, nlh, extack, + ops, hdrlen, net); +} + static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -1088,25 +1168,6 @@ problem: subsys_initcall(genl_init); -/** - * genl_family_attrbuf - return family's attrbuf - * @family: the family - * - * Return the family's attrbuf, while validating that it's - * actually valid to access it. - * - * You cannot use this function with a family that has parallel_ops - * and you can only use it within (pre/post) doit/dumpit callbacks. - */ -struct nlattr **genl_family_attrbuf(const struct genl_family *family) -{ - if (!WARN_ON(family->parallel_ops)) - lockdep_assert_held(&genl_mutex); - - return family->attrbuf; -} -EXPORT_SYMBOL(genl_family_attrbuf); - static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group, gfp_t flags) { diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index c4f54ad2b98a..58d5373c513c 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -64,28 +64,6 @@ static DEFINE_SPINLOCK(nr_list_lock); static const struct proto_ops nr_proto_ops; /* - * NETROM network devices are virtual network devices encapsulating NETROM - * frames into AX.25 which will be sent through an AX.25 device, so form a - * special "super class" of normal net devices; split their locks off into a - * separate class since they always nest. - */ -static struct lock_class_key nr_netdev_xmit_lock_key; -static struct lock_class_key nr_netdev_addr_lock_key; - -static void nr_set_lockdep_one(struct net_device *dev, - struct netdev_queue *txq, - void *_unused) -{ - lockdep_set_class(&txq->_xmit_lock, &nr_netdev_xmit_lock_key); -} - -static void nr_set_lockdep_key(struct net_device *dev) -{ - lockdep_set_class(&dev->addr_list_lock, &nr_netdev_addr_lock_key); - netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL); -} - -/* * Socket removal during an interrupt is now safe. */ static void nr_remove_socket(struct sock *sk) @@ -1414,7 +1392,6 @@ static int __init nr_proto_init(void) free_netdev(dev); goto fail; } - nr_set_lockdep_key(dev); dev_nr[i] = dev; } diff --git a/net/nfc/hci/Kconfig b/net/nfc/hci/Kconfig index 97bd3a2c5c98..4822d6f46947 100644 --- a/net/nfc/hci/Kconfig +++ b/net/nfc/hci/Kconfig @@ -1,12 +1,12 @@ # SPDX-License-Identifier: GPL-2.0-only config NFC_HCI - depends on NFC - tristate "NFC HCI implementation" - default n - help - Say Y here if you want to build support for a kernel NFC HCI - implementation. This is mostly needed for devices that only process - HCI frames, like for example the NXP pn544. + depends on NFC + tristate "NFC HCI implementation" + default n + help + Say Y here if you want to build support for a kernel NFC HCI + implementation. This is mostly needed for devices that only process + HCI frames, like for example the NXP pn544. config NFC_SHDLC depends on NFC_HCI diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 9b8742947aff..28604414dec1 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -107,9 +107,14 @@ static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) llcp_sock->service_name = kmemdup(llcp_addr.service_name, llcp_sock->service_name_len, GFP_KERNEL); - + if (!llcp_sock->service_name) { + ret = -ENOMEM; + goto put_dev; + } llcp_sock->ssap = nfc_llcp_get_sdp_ssap(local, llcp_sock); if (llcp_sock->ssap == LLCP_SAP_MAX) { + kfree(llcp_sock->service_name); + llcp_sock->service_name = NULL; ret = -EADDRINUSE; goto put_dev; } @@ -549,11 +554,11 @@ static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, if (sk->sk_state == LLCP_LISTEN) return llcp_accept_poll(sk); - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk->sk_state == LLCP_CLOSED) @@ -1004,10 +1009,13 @@ static int llcp_sock_create(struct net *net, struct socket *sock, sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - if (sock->type == SOCK_RAW) + if (sock->type == SOCK_RAW) { + if (!capable(CAP_NET_RAW)) + return -EPERM; sock->ops = &llcp_rawsock_ops; - else + } else { sock->ops = &llcp_sock_ops; + } sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern); if (sk == NULL) diff --git a/net/nfc/nci/spi.c b/net/nfc/nci/spi.c index 9dd8a1096916..7d8e10e27c20 100644 --- a/net/nfc/nci/spi.c +++ b/net/nfc/nci/spi.c @@ -44,7 +44,8 @@ static int __nci_spi_send(struct nci_spi *nspi, struct sk_buff *skb, t.len = 0; } t.cs_change = cs_change; - t.delay_usecs = nspi->xfer_udelay; + t.delay.value = nspi->xfer_udelay; + t.delay.unit = SPI_DELAY_UNIT_USECS; t.speed_hz = nspi->xfer_speed_hz; spi_message_init(&m); @@ -216,7 +217,8 @@ static struct sk_buff *__nci_spi_read(struct nci_spi *nspi) rx.rx_buf = skb_put(skb, rx_len); rx.len = rx_len; rx.cs_change = 0; - rx.delay_usecs = nspi->xfer_udelay; + rx.delay.value = nspi->xfer_udelay; + rx.delay.unit = SPI_DELAY_UNIT_USECS; rx.speed_hz = nspi->xfer_speed_hz; spi_message_add_tail(&rx, &m); diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c index 78fe622eba65..11b554ce07ff 100644 --- a/net/nfc/nci/uart.c +++ b/net/nfc/nci/uart.c @@ -346,7 +346,7 @@ static int nci_uart_default_recv_buf(struct nci_uart *nu, const u8 *data, nu->rx_packet_len = -1; nu->rx_skb = nci_skb_alloc(nu->ndev, NCI_MAX_PACKET_SIZE, - GFP_KERNEL); + GFP_ATOMIC); if (!nu->rx_skb) return -ENOMEM; } diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index ea64c90b14e8..eee0dddb7749 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -102,22 +102,14 @@ nla_put_failure: static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb) { - struct nlattr **attrbuf = genl_family_attrbuf(&nfc_genl_family); + const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct nfc_dev *dev; - int rc; u32 idx; - rc = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + nfc_genl_family.hdrsize, - attrbuf, nfc_genl_family.maxattr, - nfc_genl_policy, NULL); - if (rc < 0) - return ERR_PTR(rc); - - if (!attrbuf[NFC_ATTR_DEVICE_INDEX]) + if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return ERR_PTR(-EINVAL); - idx = nla_get_u32(attrbuf[NFC_ATTR_DEVICE_INDEX]); + idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) @@ -970,7 +962,8 @@ static int nfc_genl_dep_link_down(struct sk_buff *skb, struct genl_info *info) int rc; u32 idx; - if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) + if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || + !info->attrs[NFC_ATTR_TARGET_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); @@ -1018,7 +1011,8 @@ static int nfc_genl_llc_get_params(struct sk_buff *skb, struct genl_info *info) struct sk_buff *msg = NULL; u32 idx; - if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) + if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || + !info->attrs[NFC_ATTR_FIRMWARE_NAME]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); @@ -1097,7 +1091,6 @@ static int nfc_genl_llc_set_params(struct sk_buff *skb, struct genl_info *info) local = nfc_llcp_find_local(dev); if (!local) { - nfc_put_device(dev); rc = -ENODEV; goto exit; } @@ -1157,7 +1150,6 @@ static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info) local = nfc_llcp_find_local(dev); if (!local) { - nfc_put_device(dev); rc = -ENODEV; goto exit; } @@ -1695,7 +1687,8 @@ static const struct genl_ops nfc_genl_ops[] = { }, { .cmd = NFC_CMD_GET_TARGET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = nfc_genl_dump_targets, .done = nfc_genl_dump_targets_done, }, diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 3572e11b6f21..7fbfe2adfffa 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -161,14 +161,17 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct nlattr *attr, int len); static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, - const struct ovs_action_push_mpls *mpls) + __be32 mpls_lse, __be16 mpls_ethertype, __u16 mac_len) { int err; - err = skb_mpls_push(skb, mpls->mpls_lse, mpls->mpls_ethertype); + err = skb_mpls_push(skb, mpls_lse, mpls_ethertype, mac_len, !!mac_len); if (err) return err; + if (!mac_len) + key->mac_proto = MAC_PROTO_NONE; + invalidate_flow_key(key); return 0; } @@ -178,10 +181,14 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, { int err; - err = skb_mpls_pop(skb, ethertype); + err = skb_mpls_pop(skb, ethertype, skb->mac_len, + ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET); if (err) return err; + if (ethertype == htons(ETH_P_TEB)) + key->mac_proto = MAC_PROTO_ETHERNET; + invalidate_flow_key(key); return 0; } @@ -199,7 +206,7 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, if (err) return err; - flow_key->mpls.top_lse = lse; + flow_key->mpls.lse[0] = lse; return 0; } @@ -1226,10 +1233,24 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, execute_hash(skb, key, a); break; - case OVS_ACTION_ATTR_PUSH_MPLS: - err = push_mpls(skb, key, nla_data(a)); + case OVS_ACTION_ATTR_PUSH_MPLS: { + struct ovs_action_push_mpls *mpls = nla_data(a); + + err = push_mpls(skb, key, mpls->mpls_lse, + mpls->mpls_ethertype, skb->mac_len); break; + } + case OVS_ACTION_ATTR_ADD_MPLS: { + struct ovs_action_add_mpls *mpls = nla_data(a); + __u16 mac_len = 0; + + if (mpls->tun_flags & OVS_MPLS_L3_TUNNEL_FLAG_MASK) + mac_len = skb->mac_len; + err = push_mpls(skb, key, mpls->mpls_lse, + mpls->mpls_ethertype, mac_len); + break; + } case OVS_ACTION_ATTR_POP_MPLS: err = pop_mpls(skb, key, nla_get_be16(a)); break; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 848c6eb55064..e726159cfcfa 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -67,6 +67,7 @@ struct ovs_conntrack_info { struct md_mark mark; struct md_labels labels; char timeout[CTNL_TIMEOUT_NAME_MAX]; + struct nf_ct_timeout *nf_ct_timeout; #if IS_ENABLED(CONFIG_NF_NAT) struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. */ #endif @@ -524,6 +525,11 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key, return -EPFNOSUPPORT; } + /* The key extracted from the fragment that completed this datagram + * likely didn't have an L4 header, so regenerate it. + */ + ovs_flow_key_update_l3l4(skb, key); + key->ip.frag = OVS_FRAG_TYPE_NONE; skb_clear_hash(skb); skb->ignore_df = 1; @@ -697,6 +703,14 @@ static bool skb_nfct_cached(struct net *net, if (help && rcu_access_pointer(help->helper) != info->helper) return false; } + if (info->nf_ct_timeout) { + struct nf_conn_timeout *timeout_ext; + + timeout_ext = nf_ct_timeout_find(ct); + if (!timeout_ext || info->nf_ct_timeout != + rcu_dereference(timeout_ext->timeout)) + return false; + } /* Force conntrack entry direction to the current packet? */ if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { /* Delete the conntrack entry if confirmed, else just release @@ -889,6 +903,17 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, } err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype); + if (err == NF_ACCEPT && + ct->status & IPS_SRC_NAT && ct->status & IPS_DST_NAT) { + if (maniptype == NF_NAT_MANIP_SRC) + maniptype = NF_NAT_MANIP_DST; + else + maniptype = NF_NAT_MANIP_SRC; + + err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, + maniptype); + } + /* Mark NAT done if successful and update the flow key. */ if (err == NF_ACCEPT) ovs_nat_update_key(key, skb, maniptype); @@ -957,6 +982,8 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, ct = nf_ct_get(skb, &ctinfo); if (ct) { + bool add_helper = false; + /* Packets starting a new connection must be NATted before the * helper, so that the helper knows about the NAT. We enforce * this by delaying both NAT and helper calls for unconfirmed @@ -974,16 +1001,17 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, } /* Userspace may decide to perform a ct lookup without a helper - * specified followed by a (recirculate and) commit with one. - * Therefore, for unconfirmed connections which we will commit, - * we need to attach the helper here. + * specified followed by a (recirculate and) commit with one, + * or attach a helper in a later commit. Therefore, for + * connections which we will commit, we may need to attach + * the helper here. */ - if (!nf_ct_is_confirmed(ct) && info->commit && - info->helper && !nfct_help(ct)) { + if (info->commit && info->helper && !nfct_help(ct)) { int err = __nf_ct_try_assign_helper(ct, info->ct, GFP_ATOMIC); if (err) return err; + add_helper = true; /* helper installed, add seqadj if NAT is required */ if (info->nat && !nfct_seqadj(ct)) { @@ -993,11 +1021,13 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, } /* Call the helper only if: - * - nf_conntrack_in() was executed above ("!cached") for a - * confirmed connection, or + * - nf_conntrack_in() was executed above ("!cached") or a + * helper was just attached ("add_helper") for a confirmed + * connection, or * - When committing an unconfirmed connection. */ - if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) && + if ((nf_ct_is_confirmed(ct) ? !cached || add_helper : + info->commit) && ovs_ct_helper(skb, info->family) != NF_ACCEPT) { return -EINVAL; } @@ -1565,7 +1595,7 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, case OVS_CT_ATTR_TIMEOUT: memcpy(info->timeout, nla_data(a), nla_len(a)); if (!memchr(info->timeout, '\0', nla_len(a))) { - OVS_NLERR(log, "Invalid conntrack helper"); + OVS_NLERR(log, "Invalid conntrack timeout"); return -EINVAL; } break; @@ -1657,6 +1687,10 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, ct_info.timeout)) pr_info_ratelimited("Failed to associated timeout " "policy `%s'\n", ct_info.timeout); + else + ct_info.nf_ct_timeout = rcu_dereference( + nf_ct_timeout_find(ct_info.ct)->timeout); + } if (helper) { diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index d01410e52097..659c2a790fe7 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -222,14 +222,15 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) struct dp_stats_percpu *stats; u64 *stats_counter; u32 n_mask_hit; + int error; stats = this_cpu_ptr(dp->stats_percpu); /* Look up flow. */ - flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit); + flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), + &n_mask_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; - int error; memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; @@ -246,7 +247,10 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) ovs_flow_stats_update(flow, key->tp.flags, skb); sf_acts = rcu_dereference(flow->sf_acts); - ovs_execute_actions(dp, skb, sf_acts, key); + error = ovs_execute_actions(dp, skb, sf_acts, key); + if (unlikely(error)) + net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n", + ovs_dp_name(dp), error); stats_counter = &stats->n_hit; @@ -317,8 +321,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, } /* Queue all of the segments. */ - skb = segs; - do { + skb_list_walk_safe(segs, skb, nskb) { if (gso_type & SKB_GSO_UDP && skb != segs) key = &later_key; @@ -326,17 +329,15 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, if (err) break; - } while ((skb = skb->next)); + } /* Free all of the segments. */ - skb = segs; - do { - nskb = skb->next; + skb_list_walk_safe(segs, skb, nskb) { if (err) kfree_skb(skb); else consume_skb(skb); - } while ((skb = nskb)); + } return err; } @@ -346,7 +347,8 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */ - + nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */ + + nla_total_size(sizeof(unsigned int)) /* OVS_PACKET_ATTR_LEN */ + + nla_total_size(sizeof(u64)); /* OVS_PACKET_ATTR_HASH */ /* OVS_PACKET_ATTR_USERDATA */ if (upcall_info->userdata) @@ -389,6 +391,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, size_t len; unsigned int hlen; int err, dp_ifindex; + u64 hash; dp_ifindex = get_dpifindex(dp); if (!dp_ifindex) @@ -481,23 +484,30 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, } /* Add OVS_PACKET_ATTR_MRU */ - if (upcall_info->mru) { - if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, - upcall_info->mru)) { - err = -ENOBUFS; - goto out; - } - pad_packet(dp, user_skb); + if (upcall_info->mru && + nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, upcall_info->mru)) { + err = -ENOBUFS; + goto out; } /* Add OVS_PACKET_ATTR_LEN when packet is truncated */ - if (cutlen > 0) { - if (nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, - skb->len)) { - err = -ENOBUFS; - goto out; - } - pad_packet(dp, user_skb); + if (cutlen > 0 && + nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, skb->len)) { + err = -ENOBUFS; + goto out; + } + + /* Add OVS_PACKET_ATTR_HASH */ + hash = skb_get_hash_raw(skb); + if (skb->sw_hash) + hash |= OVS_PACKET_HASH_SW_BIT; + + if (skb->l4_hash) + hash |= OVS_PACKET_HASH_L4_BIT; + + if (nla_put(user_skb, OVS_PACKET_ATTR_HASH, sizeof (u64), &hash)) { + err = -ENOBUFS; + goto out; } /* Only reserve room for attribute header, packet data is added @@ -539,6 +549,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct vport *input_vport; u16 mru = 0; + u64 hash; int len; int err; bool log = !a[OVS_PACKET_ATTR_PROBE]; @@ -564,6 +575,14 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) } OVS_CB(packet)->mru = mru; + if (a[OVS_PACKET_ATTR_HASH]) { + hash = nla_get_u64(a[OVS_PACKET_ATTR_HASH]); + + __skb_set_hash(packet, hash & 0xFFFFFFFFULL, + !!(hash & OVS_PACKET_HASH_SW_BIT), + !!(hash & OVS_PACKET_HASH_L4_BIT)); + } + /* Build an sw_flow for sending this packet. */ flow = ovs_flow_alloc(); err = PTR_ERR(flow); @@ -701,9 +720,13 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts, { size_t len = NLMSG_ALIGN(sizeof(struct ovs_header)); - /* OVS_FLOW_ATTR_UFID */ + /* OVS_FLOW_ATTR_UFID, or unmasked flow key as fallback + * see ovs_nla_put_identifier() + */ if (sfid && ovs_identifier_is_ufid(sfid)) len += nla_total_size(sfid->ufid_len); + else + len += nla_total_size(ovs_key_attr_size()); /* OVS_FLOW_ATTR_KEY */ if (!sfid || should_fill_key(sfid, ufid_flags)) @@ -879,7 +902,10 @@ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb, info->snd_portid, info->snd_seq, 0, cmd, ufid_flags); - BUG_ON(retval < 0); + if (WARN_ON_ONCE(retval < 0)) { + kfree_skb(skb); + skb = ERR_PTR(retval); + } return skb; } @@ -1343,7 +1369,10 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) OVS_FLOW_CMD_DEL, ufid_flags); rcu_read_unlock(); - BUG_ON(err < 0); + if (WARN_ON_ONCE(err < 0)) { + kfree_skb(reply); + goto out_free; + } ovs_notify(&dp_flow_genl_family, reply, info); } else { @@ -1351,6 +1380,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) } } +out_free: ovs_flow_free(flow, true); return 0; unlock: @@ -1542,10 +1572,59 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *in dp->user_features = 0; } -static void ovs_dp_change(struct datapath *dp, struct nlattr *a[]) +DEFINE_STATIC_KEY_FALSE(tc_recirc_sharing_support); + +static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) +{ + u32 user_features = 0; + + if (a[OVS_DP_ATTR_USER_FEATURES]) { + user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); + + if (user_features & ~(OVS_DP_F_VPORT_PIDS | + OVS_DP_F_UNALIGNED | + OVS_DP_F_TC_RECIRC_SHARING)) + return -EOPNOTSUPP; + +#if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + if (user_features & OVS_DP_F_TC_RECIRC_SHARING) + return -EOPNOTSUPP; +#endif + } + + dp->user_features = user_features; + + if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) + static_branch_enable(&tc_recirc_sharing_support); + else + static_branch_disable(&tc_recirc_sharing_support); + + return 0; +} + +static int ovs_dp_stats_init(struct datapath *dp) +{ + dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); + if (!dp->stats_percpu) + return -ENOMEM; + + return 0; +} + +static int ovs_dp_vport_init(struct datapath *dp) { - if (a[OVS_DP_ATTR_USER_FEATURES]) - dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); + int i; + + dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS, + sizeof(struct hlist_head), + GFP_KERNEL); + if (!dp->ports) + return -ENOMEM; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(&dp->ports[i]); + + return 0; } static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) @@ -1556,7 +1635,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct vport *vport; struct ovs_net *ovs_net; - int err, i; + int err; err = -EINVAL; if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) @@ -1569,35 +1648,26 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) err = -ENOMEM; dp = kzalloc(sizeof(*dp), GFP_KERNEL); if (dp == NULL) - goto err_free_reply; + goto err_destroy_reply; ovs_dp_set_net(dp, sock_net(skb->sk)); /* Allocate table. */ err = ovs_flow_tbl_init(&dp->table); if (err) - goto err_free_dp; + goto err_destroy_dp; - dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); - if (!dp->stats_percpu) { - err = -ENOMEM; + err = ovs_dp_stats_init(dp); + if (err) goto err_destroy_table; - } - - dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS, - sizeof(struct hlist_head), - GFP_KERNEL); - if (!dp->ports) { - err = -ENOMEM; - goto err_destroy_percpu; - } - for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dp->ports[i]); + err = ovs_dp_vport_init(dp); + if (err) + goto err_destroy_stats; err = ovs_meters_init(dp); if (err) - goto err_destroy_ports_array; + goto err_destroy_ports; /* Set up our datapath device. */ parms.name = nla_data(a[OVS_DP_ATTR_NAME]); @@ -1607,7 +1677,9 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) parms.port_no = OVSP_LOCAL; parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; - ovs_dp_change(dp, a); + err = ovs_dp_change(dp, a); + if (err) + goto err_destroy_meters; /* So far only local changes have been made, now need the lock. */ ovs_lock(); @@ -1627,6 +1699,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_dp_reset_user_features(skb, info); } + ovs_unlock(); goto err_destroy_meters; } @@ -1643,17 +1716,16 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) return 0; err_destroy_meters: - ovs_unlock(); ovs_meters_exit(dp); -err_destroy_ports_array: +err_destroy_ports: kfree(dp->ports); -err_destroy_percpu: +err_destroy_stats: free_percpu(dp->stats_percpu); err_destroy_table: ovs_flow_tbl_destroy(&dp->table); -err_free_dp: +err_destroy_dp: kfree(dp); -err_free_reply: +err_destroy_reply: kfree_skb(reply); err: return err; @@ -1733,7 +1805,9 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(dp)) goto err_unlock_free; - ovs_dp_change(dp, info->attrs); + err = ovs_dp_change(dp, info->attrs); + if (err) + goto err_unlock_free; err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_SET); @@ -1850,7 +1924,7 @@ static struct genl_family dp_datapath_genl_family __ro_after_init = { /* Called with ovs_mutex or RCU read lock. */ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, struct net *net, u32 portid, u32 seq, - u32 flags, u8 cmd) + u32 flags, u8 cmd, gfp_t gfp) { struct ovs_header *ovs_header; struct ovs_vport_stats vport_stats; @@ -1871,7 +1945,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, goto nla_put_failure; if (!net_eq(net, dev_net(vport->dev))) { - int id = peernet2id_alloc(net, dev_net(vport->dev)); + int id = peernet2id_alloc(net, dev_net(vport->dev), gfp); if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id)) goto nla_put_failure; @@ -1912,11 +1986,12 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, struct sk_buff *skb; int retval; - skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); - retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd); + retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd, + GFP_KERNEL); BUG_ON(retval < 0); return skb; @@ -2058,7 +2133,7 @@ restart: err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_NEW); + OVS_VPORT_CMD_NEW, GFP_KERNEL); new_headroom = netdev_get_fwd_headroom(vport->dev); @@ -2119,7 +2194,7 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_SET); + OVS_VPORT_CMD_SET, GFP_KERNEL); BUG_ON(err < 0); ovs_unlock(); @@ -2159,7 +2234,7 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_DEL); + OVS_VPORT_CMD_DEL, GFP_KERNEL); BUG_ON(err < 0); /* the vport deletion may trigger dp headroom update */ @@ -2206,7 +2281,7 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) goto exit_unlock_free; err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_GET); + OVS_VPORT_CMD_GET, GFP_ATOMIC); BUG_ON(err < 0); rcu_read_unlock(); @@ -2242,7 +2317,8 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - OVS_VPORT_CMD_GET) < 0) + OVS_VPORT_CMD_GET, + GFP_ATOMIC) < 0) goto out; j++; @@ -2263,7 +2339,7 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, - [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_UNSPEC }, [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, [OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, @@ -2418,7 +2494,7 @@ static int __init dp_init(void) { int err; - BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); + BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof_field(struct sk_buff, cb)); pr_info("Open vSwitch switching datapath\n"); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 751d34accdf9..e239a46c2f94 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -139,6 +139,18 @@ struct ovs_net { bool xt_label; }; +/** + * enum ovs_pkt_hash_types - hash info to include with a packet + * to send to userspace. + * @OVS_PACKET_HASH_SW_BIT: indicates hash was computed in software stack. + * @OVS_PACKET_HASH_L4_BIT: indicates hash is a canonical 4-tuple hash + * over transport ports. + */ +enum ovs_pkt_hash_types { + OVS_PACKET_HASH_SW_BIT = (1ULL << 32), + OVS_PACKET_HASH_L4_BIT = (1ULL << 33), +}; + extern unsigned int ovs_net_id; void ovs_lock(void); void ovs_unlock(void); @@ -218,6 +230,8 @@ static inline struct datapath *get_dp(struct net *net, int dp_ifindex) extern struct notifier_block ovs_dp_device_notifier; extern struct genl_family dp_vport_genl_family; +DECLARE_STATIC_KEY_FALSE(tc_recirc_sharing_support); + void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index bc89e16e0505..9d375e74b607 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -523,78 +523,15 @@ static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key) } /** - * key_extract - extracts a flow key from an Ethernet frame. + * key_extract_l3l4 - extracts L3/L4 header information. * @skb: sk_buff that contains the frame, with skb->data pointing to the - * Ethernet header + * L3 header * @key: output flow key * - * The caller must ensure that skb->len >= ETH_HLEN. - * - * Returns 0 if successful, otherwise a negative errno value. - * - * Initializes @skb header fields as follows: - * - * - skb->mac_header: the L2 header. - * - * - skb->network_header: just past the L2 header, or just past the - * VLAN header, to the first byte of the L2 payload. - * - * - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6 - * on output, then just past the IP header, if one is present and - * of a correct length, otherwise the same as skb->network_header. - * For other key->eth.type values it is left untouched. - * - * - skb->protocol: the type of the data starting at skb->network_header. - * Equals to key->eth.type. */ -static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) +static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) { int error; - struct ethhdr *eth; - - /* Flags are always used as part of stats */ - key->tp.flags = 0; - - skb_reset_mac_header(skb); - - /* Link layer. */ - clear_vlan(key); - if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) { - if (unlikely(eth_type_vlan(skb->protocol))) - return -EINVAL; - - skb_reset_network_header(skb); - key->eth.type = skb->protocol; - } else { - eth = eth_hdr(skb); - ether_addr_copy(key->eth.src, eth->h_source); - ether_addr_copy(key->eth.dst, eth->h_dest); - - __skb_pull(skb, 2 * ETH_ALEN); - /* We are going to push all headers that we pull, so no need to - * update skb->csum here. - */ - - if (unlikely(parse_vlan(skb, key))) - return -ENOMEM; - - key->eth.type = parse_ethertype(skb); - if (unlikely(key->eth.type == htons(0))) - return -ENOMEM; - - /* Multiple tagged packets need to retain TPID to satisfy - * skb_vlan_pop(), which will later shift the ethertype into - * skb->protocol. - */ - if (key->eth.cvlan.tci & htons(VLAN_CFI_MASK)) - skb->protocol = key->eth.cvlan.tpid; - else - skb->protocol = key->eth.type; - - skb_reset_network_header(skb); - __skb_push(skb, skb->data - skb_mac_header(skb)); - } - skb_reset_mac_len(skb); /* Network layer. */ if (key->eth.type == htons(ETH_P_IP)) { @@ -623,6 +560,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) offset = nh->frag_off & htons(IP_OFFSET); if (offset) { key->ip.frag = OVS_FRAG_TYPE_LATER; + memset(&key->tp, 0, sizeof(key->tp)); return 0; } if (nh->frag_off & htons(IP_MF) || @@ -699,27 +637,35 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) memset(&key->ipv4, 0, sizeof(key->ipv4)); } } else if (eth_p_mpls(key->eth.type)) { - size_t stack_len = MPLS_HLEN; + u8 label_count = 1; + memset(&key->mpls, 0, sizeof(key->mpls)); skb_set_inner_network_header(skb, skb->mac_len); while (1) { __be32 lse; - error = check_header(skb, skb->mac_len + stack_len); + error = check_header(skb, skb->mac_len + + label_count * MPLS_HLEN); if (unlikely(error)) return 0; memcpy(&lse, skb_inner_network_header(skb), MPLS_HLEN); - if (stack_len == MPLS_HLEN) - memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN); + if (label_count <= MPLS_LABEL_DEPTH) + memcpy(&key->mpls.lse[label_count - 1], &lse, + MPLS_HLEN); - skb_set_inner_network_header(skb, skb->mac_len + stack_len); + skb_set_inner_network_header(skb, skb->mac_len + + label_count * MPLS_HLEN); if (lse & htonl(MPLS_LS_S_MASK)) break; - stack_len += MPLS_HLEN; + label_count++; } + if (label_count > MPLS_LABEL_DEPTH) + label_count = MPLS_LABEL_DEPTH; + + key->mpls.num_labels_mask = GENMASK(label_count - 1, 0); } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ @@ -740,8 +686,10 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return error; } - if (key->ip.frag == OVS_FRAG_TYPE_LATER) + if (key->ip.frag == OVS_FRAG_TYPE_LATER) { + memset(&key->tp, 0, sizeof(key->tp)); return 0; + } if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) key->ip.frag = OVS_FRAG_TYPE_FIRST; @@ -788,6 +736,92 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return 0; } +/** + * key_extract - extracts a flow key from an Ethernet frame. + * @skb: sk_buff that contains the frame, with skb->data pointing to the + * Ethernet header + * @key: output flow key + * + * The caller must ensure that skb->len >= ETH_HLEN. + * + * Returns 0 if successful, otherwise a negative errno value. + * + * Initializes @skb header fields as follows: + * + * - skb->mac_header: the L2 header. + * + * - skb->network_header: just past the L2 header, or just past the + * VLAN header, to the first byte of the L2 payload. + * + * - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6 + * on output, then just past the IP header, if one is present and + * of a correct length, otherwise the same as skb->network_header. + * For other key->eth.type values it is left untouched. + * + * - skb->protocol: the type of the data starting at skb->network_header. + * Equals to key->eth.type. + */ +static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) +{ + struct ethhdr *eth; + + /* Flags are always used as part of stats */ + key->tp.flags = 0; + + skb_reset_mac_header(skb); + + /* Link layer. */ + clear_vlan(key); + if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) { + if (unlikely(eth_type_vlan(skb->protocol))) + return -EINVAL; + + skb_reset_network_header(skb); + key->eth.type = skb->protocol; + } else { + eth = eth_hdr(skb); + ether_addr_copy(key->eth.src, eth->h_source); + ether_addr_copy(key->eth.dst, eth->h_dest); + + __skb_pull(skb, 2 * ETH_ALEN); + /* We are going to push all headers that we pull, so no need to + * update skb->csum here. + */ + + if (unlikely(parse_vlan(skb, key))) + return -ENOMEM; + + key->eth.type = parse_ethertype(skb); + if (unlikely(key->eth.type == htons(0))) + return -ENOMEM; + + /* Multiple tagged packets need to retain TPID to satisfy + * skb_vlan_pop(), which will later shift the ethertype into + * skb->protocol. + */ + if (key->eth.cvlan.tci & htons(VLAN_CFI_MASK)) + skb->protocol = key->eth.cvlan.tpid; + else + skb->protocol = key->eth.type; + + skb_reset_network_header(skb); + __skb_push(skb, skb->data - skb_mac_header(skb)); + } + + skb_reset_mac_len(skb); + + /* Fill out L3/L4 key info, if any */ + return key_extract_l3l4(skb, key); +} + +/* In the case of conntrack fragment handling it expects L3 headers, + * add a helper. + */ +int ovs_flow_key_update_l3l4(struct sk_buff *skb, struct sw_flow_key *key) +{ + return key_extract_l3l4(skb, key); +} + int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) { int res; @@ -816,6 +850,9 @@ static int key_extract_mac_proto(struct sk_buff *skb) int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + struct tc_skb_ext *tc_ext; +#endif int res, err; /* Extract metadata from packet. */ @@ -848,7 +885,17 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, if (res < 0) return res; key->mac_proto = res; + +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + if (static_branch_unlikely(&tc_recirc_sharing_support)) { + tc_ext = skb_ext_find(skb, TC_SKB_EXT); + key->recirc_id = tc_ext ? tc_ext->chain : 0; + } else { + key->recirc_id = 0; + } +#else key->recirc_id = 0; +#endif err = key_extract(skb, key); if (!err) diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index a5506e2d4b7a..758a8c77f736 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -30,13 +30,14 @@ enum sw_flow_mac_proto { MAC_PROTO_ETHERNET, }; #define SW_FLOW_KEY_INVALID 0x80 +#define MPLS_LABEL_DEPTH 3 /* Store options at the end of the array if they are less than the * maximum size. This allows us to get the benefits of variable length * matching for small options. */ #define TUN_METADATA_OFFSET(opt_len) \ - (FIELD_SIZEOF(struct sw_flow_key, tun_opts) - opt_len) + (sizeof_field(struct sw_flow_key, tun_opts) - opt_len) #define TUN_METADATA_OPTS(flow_key, opt_len) \ ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len))) @@ -51,7 +52,7 @@ struct vlan_head { #define OVS_SW_FLOW_KEY_METADATA_SIZE \ (offsetof(struct sw_flow_key, recirc_id) + \ - FIELD_SIZEOF(struct sw_flow_key, recirc_id)) + sizeof_field(struct sw_flow_key, recirc_id)) struct ovs_key_nsh { struct ovs_nsh_key_base base; @@ -85,9 +86,6 @@ struct sw_flow_key { */ union { struct { - __be32 top_lse; /* top label stack entry */ - } mpls; - struct { u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ u8 tos; /* IP ToS. */ u8 ttl; /* IP TTL/hop limit. */ @@ -135,6 +133,11 @@ struct sw_flow_key { } nd; }; } ipv6; + struct { + u32 num_labels_mask; /* labels present bitmap of effective length MPLS_LABEL_DEPTH */ + __be32 lse[MPLS_LABEL_DEPTH]; /* label stack entry */ + } mpls; + struct ovs_key_nsh nsh; /* network service header */ }; struct { @@ -166,7 +169,6 @@ struct sw_flow_key_range { struct sw_flow_mask { int ref_count; struct rcu_head rcu; - struct list_head list; struct sw_flow_key_range range; struct sw_flow_key key; }; @@ -270,6 +272,7 @@ void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); +int ovs_flow_key_update_l3l4(struct sk_buff *skb, struct sw_flow_key *key); int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key); diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index d7559c64795d..7da4230627f5 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -79,6 +79,7 @@ static bool actions_may_change_flow(const struct nlattr *actions) case OVS_ACTION_ATTR_SET_MASKED: case OVS_ACTION_ATTR_METER: case OVS_ACTION_ATTR_CHECK_PKT_LEN: + case OVS_ACTION_ATTR_ADD_MPLS: default: return true; } @@ -424,7 +425,7 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_DP_HASH] = { .len = sizeof(u32) }, [OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED, .next = ovs_tunnel_key_lens, }, - [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) }, + [OVS_KEY_ATTR_MPLS] = { .len = OVS_ATTR_VARIABLE }, [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u32) }, [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, @@ -1628,10 +1629,25 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, if (attrs & (1 << OVS_KEY_ATTR_MPLS)) { const struct ovs_key_mpls *mpls_key; + u32 hdr_len; + u32 label_count, label_count_mask, i; mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]); - SW_FLOW_KEY_PUT(match, mpls.top_lse, - mpls_key->mpls_lse, is_mask); + hdr_len = nla_len(a[OVS_KEY_ATTR_MPLS]); + label_count = hdr_len / sizeof(struct ovs_key_mpls); + + if (label_count == 0 || label_count > MPLS_LABEL_DEPTH || + hdr_len % sizeof(struct ovs_key_mpls)) + return -EINVAL; + + label_count_mask = GENMASK(label_count - 1, 0); + + for (i = 0 ; i < label_count; i++) + SW_FLOW_KEY_PUT(match, mpls.lse[i], + mpls_key[i].mpls_lse, is_mask); + + SW_FLOW_KEY_PUT(match, mpls.num_labels_mask, + label_count_mask, is_mask); attrs &= ~(1 << OVS_KEY_ATTR_MPLS); } @@ -2114,13 +2130,18 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha); ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha); } else if (eth_p_mpls(swkey->eth.type)) { + u8 i, num_labels; struct ovs_key_mpls *mpls_key; - nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key)); + num_labels = hweight_long(output->mpls.num_labels_mask); + nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, + num_labels * sizeof(*mpls_key)); if (!nla) goto nla_put_failure; + mpls_key = nla_data(nla); - mpls_key->mpls_lse = output->mpls.top_lse; + for (i = 0; i < num_labels; i++) + mpls_key[i].mpls_lse = output->mpls.lse[i]; } if ((swkey->eth.type == htons(ETH_P_IP) || @@ -2406,13 +2427,14 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci, bool log); + __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log); static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, - bool log, bool last) + u32 mpls_label_count, bool log, bool last) { const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; const struct nlattr *probability, *actions; @@ -2463,7 +2485,7 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, return err; err = __ovs_nla_copy_actions(net, actions, key, sfa, - eth_type, vlan_tci, log); + eth_type, vlan_tci, mpls_label_count, log); if (err) return err; @@ -2478,7 +2500,7 @@ static int validate_and_copy_clone(struct net *net, const struct sw_flow_key *key, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, - bool log, bool last) + u32 mpls_label_count, bool log, bool last) { int start, err; u32 exec; @@ -2498,7 +2520,7 @@ static int validate_and_copy_clone(struct net *net, return err; err = __ovs_nla_copy_actions(net, attr, key, sfa, - eth_type, vlan_tci, log); + eth_type, vlan_tci, mpls_label_count, log); if (err) return err; @@ -2864,6 +2886,7 @@ static int validate_and_copy_check_pkt_len(struct net *net, const struct sw_flow_key *key, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log, bool last) { const struct nlattr *acts_if_greater, *acts_if_lesser_eq; @@ -2912,7 +2935,7 @@ static int validate_and_copy_check_pkt_len(struct net *net, return nested_acts_start; err = __ovs_nla_copy_actions(net, acts_if_lesser_eq, key, sfa, - eth_type, vlan_tci, log); + eth_type, vlan_tci, mpls_label_count, log); if (err) return err; @@ -2925,7 +2948,7 @@ static int validate_and_copy_check_pkt_len(struct net *net, return nested_acts_start; err = __ovs_nla_copy_actions(net, acts_if_greater, key, sfa, - eth_type, vlan_tci, log); + eth_type, vlan_tci, mpls_label_count, log); if (err) return err; @@ -2952,7 +2975,8 @@ static int copy_action(const struct nlattr *from, static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci, bool log) + __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log) { u8 mac_proto = ovs_key_mac_proto(key); const struct nlattr *a; @@ -2982,6 +3006,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_METER] = sizeof(u32), [OVS_ACTION_ATTR_CLONE] = (u32)-1, [OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1, + [OVS_ACTION_ATTR_ADD_MPLS] = sizeof(struct ovs_action_add_mpls), }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -3049,6 +3074,33 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, case OVS_ACTION_ATTR_RECIRC: break; + case OVS_ACTION_ATTR_ADD_MPLS: { + const struct ovs_action_add_mpls *mpls = nla_data(a); + + if (!eth_p_mpls(mpls->mpls_ethertype)) + return -EINVAL; + + if (mpls->tun_flags & OVS_MPLS_L3_TUNNEL_FLAG_MASK) { + if (vlan_tci & htons(VLAN_CFI_MASK) || + (eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6) && + eth_type != htons(ETH_P_ARP) && + eth_type != htons(ETH_P_RARP) && + !eth_p_mpls(eth_type))) + return -EINVAL; + mpls_label_count++; + } else { + if (mac_proto == MAC_PROTO_ETHERNET) { + mpls_label_count = 1; + mac_proto = MAC_PROTO_NONE; + } else { + mpls_label_count++; + } + } + eth_type = mpls->mpls_ethertype; + break; + } + case OVS_ACTION_ATTR_PUSH_MPLS: { const struct ovs_action_push_mpls *mpls = nla_data(a); @@ -3065,25 +3117,41 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, !eth_p_mpls(eth_type))) return -EINVAL; eth_type = mpls->mpls_ethertype; + mpls_label_count++; break; } - case OVS_ACTION_ATTR_POP_MPLS: + case OVS_ACTION_ATTR_POP_MPLS: { + __be16 proto; if (vlan_tci & htons(VLAN_CFI_MASK) || !eth_p_mpls(eth_type)) return -EINVAL; - /* Disallow subsequent L2.5+ set and mpls_pop actions - * as there is no check here to ensure that the new - * eth_type is valid and thus set actions could - * write off the end of the packet or otherwise - * corrupt it. + /* Disallow subsequent L2.5+ set actions and mpls_pop + * actions once the last MPLS label in the packet is + * is popped as there is no check here to ensure that + * the new eth type is valid and thus set actions could + * write off the end of the packet or otherwise corrupt + * it. * * Support for these actions is planned using packet * recirculation. */ - eth_type = htons(0); + proto = nla_get_be16(a); + + if (proto == htons(ETH_P_TEB) && + mac_proto != MAC_PROTO_NONE) + return -EINVAL; + + mpls_label_count--; + + if (!eth_p_mpls(proto) || !mpls_label_count) + eth_type = htons(0); + else + eth_type = proto; + break; + } case OVS_ACTION_ATTR_SET: err = validate_set(a, key, sfa, @@ -3106,6 +3174,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, err = validate_and_copy_sample(net, a, key, sfa, eth_type, vlan_tci, + mpls_label_count, log, last); if (err) return err; @@ -3176,6 +3245,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, err = validate_and_copy_clone(net, a, key, sfa, eth_type, vlan_tci, + mpls_label_count, log, last); if (err) return err; @@ -3188,8 +3258,9 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, err = validate_and_copy_check_pkt_len(net, a, key, sfa, eth_type, - vlan_tci, log, - last); + vlan_tci, + mpls_label_count, + log, last); if (err) return err; skip_copy = true; @@ -3219,14 +3290,18 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, struct sw_flow_actions **sfa, bool log) { int err; + u32 mpls_label_count = 0; *sfa = nla_alloc_flow_actions(min(nla_len(attr), MAX_ACTIONS_BUFSIZE)); if (IS_ERR(*sfa)) return PTR_ERR(*sfa); + if (eth_p_mpls(key->eth.type)) + mpls_label_count = hweight_long(key->mpls.num_labels_mask); + (*sfa)->orig_len = nla_len(attr); err = __ovs_nla_copy_actions(net, attr, key, sfa, key->eth.type, - key->eth.vlan.tci, log); + key->eth.vlan.tci, mpls_label_count, log); if (err) ovs_nla_free_flow_actions(*sfa); diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index cf3582c5ed70..5904e93e5765 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -34,8 +34,13 @@ #include <net/ndisc.h> #define TBL_MIN_BUCKETS 1024 +#define MASK_ARRAY_SIZE_MIN 16 #define REHASH_INTERVAL (10 * 60 * HZ) +#define MC_HASH_SHIFT 8 +#define MC_HASH_ENTRIES (1u << MC_HASH_SHIFT) +#define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT) + static struct kmem_cache *flow_cache; struct kmem_cache *flow_stats_cache __read_mostly; @@ -164,14 +169,133 @@ static struct table_instance *table_instance_alloc(int new_size) return ti; } +static struct mask_array *tbl_mask_array_alloc(int size) +{ + struct mask_array *new; + + size = max(MASK_ARRAY_SIZE_MIN, size); + new = kzalloc(sizeof(struct mask_array) + + sizeof(struct sw_flow_mask *) * size, GFP_KERNEL); + if (!new) + return NULL; + + new->count = 0; + new->max = size; + + return new; +} + +static int tbl_mask_array_realloc(struct flow_table *tbl, int size) +{ + struct mask_array *old; + struct mask_array *new; + + new = tbl_mask_array_alloc(size); + if (!new) + return -ENOMEM; + + old = ovsl_dereference(tbl->mask_array); + if (old) { + int i; + + for (i = 0; i < old->max; i++) { + if (ovsl_dereference(old->masks[i])) + new->masks[new->count++] = old->masks[i]; + } + } + + rcu_assign_pointer(tbl->mask_array, new); + kfree_rcu(old, rcu); + + return 0; +} + +static int tbl_mask_array_add_mask(struct flow_table *tbl, + struct sw_flow_mask *new) +{ + struct mask_array *ma = ovsl_dereference(tbl->mask_array); + int err, ma_count = READ_ONCE(ma->count); + + if (ma_count >= ma->max) { + err = tbl_mask_array_realloc(tbl, ma->max + + MASK_ARRAY_SIZE_MIN); + if (err) + return err; + + ma = ovsl_dereference(tbl->mask_array); + } + + BUG_ON(ovsl_dereference(ma->masks[ma_count])); + + rcu_assign_pointer(ma->masks[ma_count], new); + WRITE_ONCE(ma->count, ma_count +1); + + return 0; +} + +static void tbl_mask_array_del_mask(struct flow_table *tbl, + struct sw_flow_mask *mask) +{ + struct mask_array *ma = ovsl_dereference(tbl->mask_array); + int i, ma_count = READ_ONCE(ma->count); + + /* Remove the deleted mask pointers from the array */ + for (i = 0; i < ma_count; i++) { + if (mask == ovsl_dereference(ma->masks[i])) + goto found; + } + + BUG(); + return; + +found: + WRITE_ONCE(ma->count, ma_count -1); + + rcu_assign_pointer(ma->masks[i], ma->masks[ma_count -1]); + RCU_INIT_POINTER(ma->masks[ma_count -1], NULL); + + kfree_rcu(mask, rcu); + + /* Shrink the mask array if necessary. */ + if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) && + ma_count <= (ma->max / 3)) + tbl_mask_array_realloc(tbl, ma->max / 2); +} + +/* Remove 'mask' from the mask list, if it is not needed any more. */ +static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) +{ + if (mask) { + /* ovs-lock is required to protect mask-refcount and + * mask list. + */ + ASSERT_OVSL(); + BUG_ON(!mask->ref_count); + mask->ref_count--; + + if (!mask->ref_count) + tbl_mask_array_del_mask(tbl, mask); + } +} + int ovs_flow_tbl_init(struct flow_table *table) { struct table_instance *ti, *ufid_ti; + struct mask_array *ma; - ti = table_instance_alloc(TBL_MIN_BUCKETS); + table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) * + MC_HASH_ENTRIES, + __alignof__(struct mask_cache_entry)); + if (!table->mask_cache) + return -ENOMEM; + ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN); + if (!ma) + goto free_mask_cache; + + ti = table_instance_alloc(TBL_MIN_BUCKETS); if (!ti) - return -ENOMEM; + goto free_mask_array; ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS); if (!ufid_ti) @@ -179,7 +303,7 @@ int ovs_flow_tbl_init(struct flow_table *table) rcu_assign_pointer(table->ti, ti); rcu_assign_pointer(table->ufid_ti, ufid_ti); - INIT_LIST_HEAD(&table->mask_list); + rcu_assign_pointer(table->mask_array, ma); table->last_rehash = jiffies; table->count = 0; table->ufid_count = 0; @@ -187,6 +311,10 @@ int ovs_flow_tbl_init(struct flow_table *table) free_ti: __table_instance_destroy(ti); +free_mask_array: + kfree(ma); +free_mask_cache: + free_percpu(table->mask_cache); return -ENOMEM; } @@ -197,7 +325,28 @@ static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) __table_instance_destroy(ti); } -static void table_instance_destroy(struct table_instance *ti, +static void table_instance_flow_free(struct flow_table *table, + struct table_instance *ti, + struct table_instance *ufid_ti, + struct sw_flow *flow, + bool count) +{ + hlist_del_rcu(&flow->flow_table.node[ti->node_ver]); + if (count) + table->count--; + + if (ovs_identifier_is_ufid(&flow->id)) { + hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]); + + if (count) + table->ufid_count--; + } + + flow_mask_remove(table, flow->mask); +} + +static void table_instance_destroy(struct flow_table *table, + struct table_instance *ti, struct table_instance *ufid_ti, bool deferred) { @@ -214,13 +363,12 @@ static void table_instance_destroy(struct table_instance *ti, struct sw_flow *flow; struct hlist_head *head = &ti->buckets[i]; struct hlist_node *n; - int ver = ti->node_ver; - int ufid_ver = ufid_ti->node_ver; - hlist_for_each_entry_safe(flow, n, head, flow_table.node[ver]) { - hlist_del_rcu(&flow->flow_table.node[ver]); - if (ovs_identifier_is_ufid(&flow->id)) - hlist_del_rcu(&flow->ufid_table.node[ufid_ver]); + hlist_for_each_entry_safe(flow, n, head, + flow_table.node[ti->node_ver]) { + + table_instance_flow_free(table, ti, ufid_ti, + flow, false); ovs_flow_free(flow, deferred); } } @@ -243,7 +391,9 @@ void ovs_flow_tbl_destroy(struct flow_table *table) struct table_instance *ti = rcu_dereference_raw(table->ti); struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); - table_instance_destroy(ti, ufid_ti, false); + free_percpu(table->mask_cache); + kfree_rcu(rcu_dereference_raw(table->mask_array), rcu); + table_instance_destroy(table, ti, ufid_ti, false); } struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, @@ -359,7 +509,7 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table) flow_table->count = 0; flow_table->ufid_count = 0; - table_instance_destroy(old_ti, old_ufid_ti, true); + table_instance_destroy(flow_table, old_ti, old_ufid_ti, true); return 0; err_free_ti: @@ -370,13 +520,10 @@ err_free_ti: static u32 flow_hash(const struct sw_flow_key *key, const struct sw_flow_key_range *range) { - int key_start = range->start; - int key_end = range->end; - const u32 *hash_key = (const u32 *)((const u8 *)key + key_start); - int hash_u32s = (key_end - key_start) >> 2; + const u32 *hash_key = (const u32 *)((const u8 *)key + range->start); /* Make sure number of hash bytes are multiple of u32. */ - BUILD_BUG_ON(sizeof(long) % sizeof(u32)); + int hash_u32s = range_n_bytes(range) >> 2; return jhash2(hash_key, hash_u32s, 0); } @@ -425,7 +572,8 @@ static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, static struct sw_flow *masked_flow_lookup(struct table_instance *ti, const struct sw_flow_key *unmasked, - const struct sw_flow_mask *mask) + const struct sw_flow_mask *mask, + u32 *n_mask_hit) { struct sw_flow *flow; struct hlist_head *head; @@ -435,6 +583,8 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, ovs_flow_mask_key(&masked_key, unmasked, false, mask); hash = flow_hash(&masked_key, &mask->range); head = find_bucket(ti, hash); + (*n_mask_hit)++; + hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) { if (flow->mask == mask && flow->flow_table.hash == hash && flow_cmp_masked_key(flow, &masked_key, &mask->range)) @@ -443,46 +593,147 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, return NULL; } -struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, - const struct sw_flow_key *key, - u32 *n_mask_hit) +/* Flow lookup does full lookup on flow table. It starts with + * mask from index passed in *index. + */ +static struct sw_flow *flow_lookup(struct flow_table *tbl, + struct table_instance *ti, + struct mask_array *ma, + const struct sw_flow_key *key, + u32 *n_mask_hit, + u32 *index) { - struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct sw_flow *flow; struct sw_flow_mask *mask; + int i; + + if (likely(*index < ma->max)) { + mask = rcu_dereference_ovsl(ma->masks[*index]); + if (mask) { + flow = masked_flow_lookup(ti, key, mask, n_mask_hit); + if (flow) + return flow; + } + } + + for (i = 0; i < ma->max; i++) { + + if (i == *index) + continue; + + mask = rcu_dereference_ovsl(ma->masks[i]); + if (unlikely(!mask)) + break; + + flow = masked_flow_lookup(ti, key, mask, n_mask_hit); + if (flow) { /* Found */ + *index = i; + return flow; + } + } + + return NULL; +} + +/* + * mask_cache maps flow to probable mask. This cache is not tightly + * coupled cache, It means updates to mask list can result in inconsistent + * cache entry in mask cache. + * This is per cpu cache and is divided in MC_HASH_SEGS segments. + * In case of a hash collision the entry is hashed in next segment. + * */ +struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, + const struct sw_flow_key *key, + u32 skb_hash, + u32 *n_mask_hit) +{ + struct mask_array *ma = rcu_dereference(tbl->mask_array); + struct table_instance *ti = rcu_dereference(tbl->ti); + struct mask_cache_entry *entries, *ce; struct sw_flow *flow; + u32 hash; + int seg; *n_mask_hit = 0; - list_for_each_entry_rcu(mask, &tbl->mask_list, list) { - (*n_mask_hit)++; - flow = masked_flow_lookup(ti, key, mask); - if (flow) /* Found */ + if (unlikely(!skb_hash)) { + u32 mask_index = 0; + + return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index); + } + + /* Pre and post recirulation flows usually have the same skb_hash + * value. To avoid hash collisions, rehash the 'skb_hash' with + * 'recirc_id'. */ + if (key->recirc_id) + skb_hash = jhash_1word(skb_hash, key->recirc_id); + + ce = NULL; + hash = skb_hash; + entries = this_cpu_ptr(tbl->mask_cache); + + /* Find the cache entry 'ce' to operate on. */ + for (seg = 0; seg < MC_HASH_SEGS; seg++) { + int index = hash & (MC_HASH_ENTRIES - 1); + struct mask_cache_entry *e; + + e = &entries[index]; + if (e->skb_hash == skb_hash) { + flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, + &e->mask_index); + if (!flow) + e->skb_hash = 0; return flow; + } + + if (!ce || e->skb_hash < ce->skb_hash) + ce = e; /* A better replacement cache candidate. */ + + hash >>= MC_HASH_SHIFT; } - return NULL; + + /* Cache miss, do full lookup. */ + flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index); + if (flow) + ce->skb_hash = skb_hash; + + return flow; } struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, const struct sw_flow_key *key) { + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array); u32 __always_unused n_mask_hit; + u32 index = 0; - return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit); + return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &index); } struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, const struct sw_flow_match *match) { - struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); - struct sw_flow_mask *mask; - struct sw_flow *flow; + struct mask_array *ma = ovsl_dereference(tbl->mask_array); + int i; /* Always called under ovs-mutex. */ - list_for_each_entry(mask, &tbl->mask_list, list) { - flow = masked_flow_lookup(ti, match->key, mask); + for (i = 0; i < ma->max; i++) { + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 __always_unused n_mask_hit; + struct sw_flow_mask *mask; + struct sw_flow *flow; + + mask = ovsl_dereference(ma->masks[i]); + if (!mask) + continue; + + flow = masked_flow_lookup(ti, match->key, mask, &n_mask_hit); if (flow && ovs_identifier_is_key(&flow->id) && - ovs_flow_cmp_unmasked_key(flow, match)) + ovs_flow_cmp_unmasked_key(flow, match)) { return flow; + } } + return NULL; } @@ -528,13 +779,8 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl, int ovs_flow_tbl_num_masks(const struct flow_table *table) { - struct sw_flow_mask *mask; - int num = 0; - - list_for_each_entry(mask, &table->mask_list, list) - num++; - - return num; + struct mask_array *ma = rcu_dereference_ovsl(table->mask_array); + return READ_ONCE(ma->count); } static struct table_instance *table_instance_expand(struct table_instance *ti, @@ -543,24 +789,6 @@ static struct table_instance *table_instance_expand(struct table_instance *ti, return table_instance_rehash(ti, ti->n_buckets * 2, ufid); } -/* Remove 'mask' from the mask list, if it is not needed any more. */ -static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) -{ - if (mask) { - /* ovs-lock is required to protect mask-refcount and - * mask list. - */ - ASSERT_OVSL(); - BUG_ON(!mask->ref_count); - mask->ref_count--; - - if (!mask->ref_count) { - list_del_rcu(&mask->list); - kfree_rcu(mask, rcu); - } - } -} - /* Must be called with OVS mutex held. */ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) { @@ -568,17 +796,7 @@ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti); BUG_ON(table->count == 0); - hlist_del_rcu(&flow->flow_table.node[ti->node_ver]); - table->count--; - if (ovs_identifier_is_ufid(&flow->id)) { - hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]); - table->ufid_count--; - } - - /* RCU delete the mask. 'flow->mask' is not NULLed, as it should be - * accessible as long as the RCU read lock is held. - */ - flow_mask_remove(table, flow->mask); + table_instance_flow_free(table, ti, ufid_ti, flow, true); } static struct sw_flow_mask *mask_alloc(void) @@ -606,13 +824,16 @@ static bool mask_equal(const struct sw_flow_mask *a, static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl, const struct sw_flow_mask *mask) { - struct list_head *ml; + struct mask_array *ma; + int i; + + ma = ovsl_dereference(tbl->mask_array); + for (i = 0; i < ma->max; i++) { + struct sw_flow_mask *t; + t = ovsl_dereference(ma->masks[i]); - list_for_each(ml, &tbl->mask_list) { - struct sw_flow_mask *m; - m = container_of(ml, struct sw_flow_mask, list); - if (mask_equal(mask, m)) - return m; + if (t && mask_equal(mask, t)) + return t; } return NULL; @@ -623,6 +844,7 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, const struct sw_flow_mask *new) { struct sw_flow_mask *mask; + mask = flow_mask_find(tbl, new); if (!mask) { /* Allocate a new mask if none exsits. */ @@ -631,7 +853,12 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, return -ENOMEM; mask->key = new->key; mask->range = new->range; - list_add_rcu(&mask->list, &tbl->mask_list); + + /* Add mask to mask-list. */ + if (tbl_mask_array_add_mask(tbl, mask)) { + kfree(mask); + return -ENOMEM; + } } else { BUG_ON(!mask->ref_count); mask->ref_count++; diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index bc52045b63ff..8a5cea6ae111 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -22,6 +22,17 @@ #include "flow.h" +struct mask_cache_entry { + u32 skb_hash; + u32 mask_index; +}; + +struct mask_array { + struct rcu_head rcu; + int count, max; + struct sw_flow_mask __rcu *masks[]; +}; + struct table_instance { struct hlist_head *buckets; unsigned int n_buckets; @@ -34,7 +45,8 @@ struct table_instance { struct flow_table { struct table_instance __rcu *ti; struct table_instance __rcu *ufid_ti; - struct list_head mask_list; + struct mask_cache_entry __percpu *mask_cache; + struct mask_array __rcu *mask_array; unsigned long last_rehash; unsigned int count; unsigned int ufid_count; @@ -60,8 +72,9 @@ int ovs_flow_tbl_num_masks(const struct flow_table *table); struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, u32 *bucket, u32 *idx); struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, - const struct sw_flow_key *, - u32 *n_mask_hit); + const struct sw_flow_key *, + u32 skb_hash, + u32 *n_mask_hit); struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, const struct sw_flow_key *); struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index d2437b5b2f6a..58a7b8312c28 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -137,7 +137,7 @@ static void do_setup(struct net_device *netdev) netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | IFF_NO_QUEUE; netdev->needs_free_netdev = true; - netdev->priv_destructor = internal_dev_destructor; + netdev->priv_destructor = NULL; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; @@ -159,7 +159,6 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) struct internal_dev *internal_dev; struct net_device *dev; int err; - bool free_vport = true; vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms); if (IS_ERR(vport)) { @@ -190,10 +189,9 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) rtnl_lock(); err = register_netdevice(vport->dev); - if (err) { - free_vport = false; + if (err) goto error_unlock; - } + vport->dev->priv_destructor = internal_dev_destructor; dev_set_promiscuity(vport->dev, 1); rtnl_unlock(); @@ -207,8 +205,7 @@ error_unlock: error_free_netdev: free_netdev(dev); error_free_vport: - if (free_vport) - ovs_vport_free(vport); + ovs_vport_free(vport); error: return ERR_PTR(err); } @@ -237,7 +234,7 @@ static netdev_tx_t internal_dev_recv(struct sk_buff *skb) } skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); secpath_reset(skb); skb->pkt_type = PACKET_HOST; diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 3fc38d16c456..5da9392b03d6 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -403,8 +403,9 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) ids = rcu_dereference(vport->upcall_portids); - if (ids->n_ids == 1 && ids->ids[0] == 0) - return 0; + /* If there is only one portid, select it in the fast-path. */ + if (ids->n_ids == 1) + return ids->ids[0]; hash = skb_get_hash(skb); ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e2742b006d25..30c6879d6774 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -408,17 +408,17 @@ static int __packet_get_status(const struct packet_sock *po, void *frame) } } -static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts, +static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts, unsigned int flags) { struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); if (shhwtstamps && (flags & SOF_TIMESTAMPING_RAW_HARDWARE) && - ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts)) + ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts)) return TP_STATUS_TS_RAW_HARDWARE; - if (ktime_to_timespec_cond(skb->tstamp, ts)) + if (ktime_to_timespec64_cond(skb->tstamp, ts)) return TP_STATUS_TS_SOFTWARE; return 0; @@ -428,13 +428,20 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, struct sk_buff *skb) { union tpacket_uhdr h; - struct timespec ts; + struct timespec64 ts; __u32 ts_status; if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) return 0; h.raw = frame; + /* + * versions 1 through 3 overflow the timestamps in y2106, since they + * all store the seconds in a 32-bit unsigned integer. + * If we create a version 4, that should have a 64-bit timestamp, + * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit + * nanoseconds. + */ switch (po->tp_version) { case TPACKET_V1: h.h1->tp_sec = ts.tv_sec; @@ -520,7 +527,7 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po, int blk_size_in_bytes) { struct net_device *dev; - unsigned int mbits = 0, msec = 0, div = 0, tmo = 0; + unsigned int mbits, div; struct ethtool_link_ksettings ecmd; int err; @@ -532,30 +539,25 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po, } err = __ethtool_get_link_ksettings(dev, &ecmd); rtnl_unlock(); - if (!err) { - /* - * If the link speed is so slow you don't really - * need to worry about perf anyways - */ - if (ecmd.base.speed < SPEED_1000 || - ecmd.base.speed == SPEED_UNKNOWN) { - return DEFAULT_PRB_RETIRE_TOV; - } else { - msec = 1; - div = ecmd.base.speed / 1000; - } - } + if (err) + return DEFAULT_PRB_RETIRE_TOV; + + /* If the link speed is so slow you don't really + * need to worry about perf anyways + */ + if (ecmd.base.speed < SPEED_1000 || + ecmd.base.speed == SPEED_UNKNOWN) + return DEFAULT_PRB_RETIRE_TOV; + div = ecmd.base.speed / 1000; mbits = (blk_size_in_bytes * 8) / (1024 * 1024); if (div) mbits /= div; - tmo = mbits * msec; - if (div) - return tmo+1; - return tmo; + return mbits + 1; + return mbits; } static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, @@ -774,8 +776,8 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1, * It shouldn't really happen as we don't close empty * blocks. See prb_retire_rx_blk_timer_expired(). */ - struct timespec ts; - getnstimeofday(&ts); + struct timespec64 ts; + ktime_get_real_ts64(&ts); h1->ts_last_pkt.ts_sec = ts.tv_sec; h1->ts_last_pkt.ts_nsec = ts.tv_nsec; } @@ -805,7 +807,7 @@ static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) static void prb_open_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1) { - struct timespec ts; + struct timespec64 ts; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; smp_rmb(); @@ -818,7 +820,7 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1, BLOCK_NUM_PKTS(pbd1) = 0; BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); - getnstimeofday(&ts); + ktime_get_real_ts64(&ts); h1->ts_first_pkt.ts_sec = ts.tv_sec; h1->ts_first_pkt.ts_nsec = ts.tv_nsec; @@ -1295,15 +1297,21 @@ static void packet_sock_destruct(struct sock *sk) static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) { - u32 rxhash; + u32 *history = po->rollover->history; + u32 victim, rxhash; int i, count = 0; rxhash = skb_get_hash(skb); for (i = 0; i < ROLLOVER_HLEN; i++) - if (po->rollover->history[i] == rxhash) + if (READ_ONCE(history[i]) == rxhash) count++; - po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash; + victim = prandom_u32() % ROLLOVER_HLEN; + + /* Avoid dirtying the cache line if possible */ + if (READ_ONCE(history[victim]) != rxhash) + WRITE_ONCE(history[victim], rxhash); + return count > (ROLLOVER_HLEN >> 1); } @@ -1821,7 +1829,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, skb_dst_drop(skb); /* drop conntrack reference */ - nf_reset(skb); + nf_reset_ct(skb); spkt = &PACKET_SKB_CB(skb)->sa.pkt; @@ -2121,7 +2129,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, skb_dst_drop(skb); /* drop conntrack reference */ - nf_reset(skb); + nf_reset_ct(skb); spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; @@ -2162,7 +2170,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, unsigned long status = TP_STATUS_USER; unsigned short macoff, netoff, hdrlen; struct sk_buff *copy_skb = NULL; - struct timespec ts; + struct timespec64 ts; __u32 ts_status; bool is_drop_n_account = false; bool do_vnet = false; @@ -2294,7 +2302,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, skb_copy_bits(skb, 0, h.raw + macoff, snaplen); if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) - getnstimeofday(&ts); + ktime_get_real_ts64(&ts); status |= ts_status; diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 49bd76a87461..ac0fae06cc15 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -35,8 +35,6 @@ static unsigned int phonet_net_id __read_mostly; static struct phonet_net *phonet_pernet(struct net *net) { - BUG_ON(!net); - return net_generic(net, phonet_net_id); } diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 96ea9f254ae9..76d499f6af9a 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -338,9 +338,9 @@ static __poll_t pn_socket_poll(struct file *file, struct socket *sock, if (sk->sk_state == TCP_CLOSE) return EPOLLERR; - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; - if (!skb_queue_empty(&pn->ctrlreq_queue)) + if (!skb_queue_empty_lockless(&pn->ctrlreq_queue)) mask |= EPOLLPRI; if (!mask && sk->sk_state == TCP_CLOSE_WAIT) return EPOLLHUP; diff --git a/net/psample/psample.c b/net/psample/psample.c index 841f198ea1a8..6f2fbc6b9eb2 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -73,7 +73,7 @@ static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg, int idx = 0; int err; - spin_lock(&psample_groups_lock); + spin_lock_bh(&psample_groups_lock); list_for_each_entry(group, &psample_groups_list, list) { if (!net_eq(group->net, sock_net(msg->sk))) continue; @@ -89,7 +89,7 @@ static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg, idx++; } - spin_unlock(&psample_groups_lock); + spin_unlock_bh(&psample_groups_lock); cb->args[0] = idx; return msg->len; } @@ -154,7 +154,7 @@ static void psample_group_destroy(struct psample_group *group) { psample_group_notify(group, PSAMPLE_CMD_DEL_GROUP); list_del(&group->list); - kfree(group); + kfree_rcu(group, rcu); } static struct psample_group * @@ -172,7 +172,7 @@ struct psample_group *psample_group_get(struct net *net, u32 group_num) { struct psample_group *group; - spin_lock(&psample_groups_lock); + spin_lock_bh(&psample_groups_lock); group = psample_group_lookup(net, group_num); if (!group) { @@ -183,19 +183,27 @@ struct psample_group *psample_group_get(struct net *net, u32 group_num) group->refcount++; out: - spin_unlock(&psample_groups_lock); + spin_unlock_bh(&psample_groups_lock); return group; } EXPORT_SYMBOL_GPL(psample_group_get); +void psample_group_take(struct psample_group *group) +{ + spin_lock_bh(&psample_groups_lock); + group->refcount++; + spin_unlock_bh(&psample_groups_lock); +} +EXPORT_SYMBOL_GPL(psample_group_take); + void psample_group_put(struct psample_group *group) { - spin_lock(&psample_groups_lock); + spin_lock_bh(&psample_groups_lock); if (--group->refcount == 0) psample_group_destroy(group); - spin_unlock(&psample_groups_lock); + spin_unlock_bh(&psample_groups_lock); } EXPORT_SYMBOL_GPL(psample_group_put); @@ -221,7 +229,7 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, data_len = PSAMPLE_MAX_PACKET_SIZE - meta_len - NLA_HDRLEN - NLA_ALIGNTO; - nl_skb = genlmsg_new(meta_len + data_len, GFP_ATOMIC); + nl_skb = genlmsg_new(meta_len + nla_total_size(data_len), GFP_ATOMIC); if (unlikely(!nl_skb)) return; diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 6c8b0f6d28f9..5a8e42ad1504 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -8,6 +8,8 @@ #include <linux/qrtr.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/numa.h> +#include <linux/spinlock.h> +#include <linux/wait.h> #include <net/sock.h> @@ -97,10 +99,11 @@ static inline struct qrtr_sock *qrtr_sk(struct sock *sk) static unsigned int qrtr_local_nid = NUMA_NO_NODE; /* for node ids */ -static RADIX_TREE(qrtr_nodes, GFP_KERNEL); +static RADIX_TREE(qrtr_nodes, GFP_ATOMIC); +static DEFINE_SPINLOCK(qrtr_nodes_lock); /* broadcast list */ static LIST_HEAD(qrtr_all_nodes); -/* lock for qrtr_nodes, qrtr_all_nodes and node reference */ +/* lock for qrtr_all_nodes and node reference */ static DEFINE_MUTEX(qrtr_node_lock); /* local port allocation management */ @@ -113,8 +116,9 @@ static DEFINE_MUTEX(qrtr_port_lock); * @ep: endpoint * @ref: reference count for node * @nid: node id + * @qrtr_tx_flow: tree of qrtr_tx_flow, keyed by node << 32 | port + * @qrtr_tx_lock: lock for qrtr_tx_flow inserts * @rx_queue: receive queue - * @work: scheduled work struct for recv work * @item: list item for broadcast list */ struct qrtr_node { @@ -123,17 +127,36 @@ struct qrtr_node { struct kref ref; unsigned int nid; + struct radix_tree_root qrtr_tx_flow; + struct mutex qrtr_tx_lock; /* for qrtr_tx_flow */ + struct sk_buff_head rx_queue; - struct work_struct work; struct list_head item; }; +/** + * struct qrtr_tx_flow - tx flow control + * @resume_tx: waiters for a resume tx from the remote + * @pending: number of waiting senders + * @tx_failed: indicates that a message with confirm_rx flag was lost + */ +struct qrtr_tx_flow { + struct wait_queue_head resume_tx; + int pending; + int tx_failed; +}; + +#define QRTR_TX_FLOW_HIGH 10 +#define QRTR_TX_FLOW_LOW 5 + static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to); static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to); +static struct qrtr_sock *qrtr_port_lookup(int port); +static void qrtr_port_put(struct qrtr_sock *ipc); /* Release node resources and free the node. * @@ -143,14 +166,25 @@ static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, static void __qrtr_node_release(struct kref *kref) { struct qrtr_node *node = container_of(kref, struct qrtr_node, ref); + struct radix_tree_iter iter; + unsigned long flags; + void __rcu **slot; + spin_lock_irqsave(&qrtr_nodes_lock, flags); if (node->nid != QRTR_EP_NID_AUTO) radix_tree_delete(&qrtr_nodes, node->nid); + spin_unlock_irqrestore(&qrtr_nodes_lock, flags); list_del(&node->item); mutex_unlock(&qrtr_node_lock); skb_queue_purge(&node->rx_queue); + + /* Free tx flow counters */ + radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) { + radix_tree_iter_delete(&node->qrtr_tx_flow, &iter, slot); + kfree(*slot); + } kfree(node); } @@ -170,6 +204,126 @@ static void qrtr_node_release(struct qrtr_node *node) kref_put_mutex(&node->ref, __qrtr_node_release, &qrtr_node_lock); } +/** + * qrtr_tx_resume() - reset flow control counter + * @node: qrtr_node that the QRTR_TYPE_RESUME_TX packet arrived on + * @skb: resume_tx packet + */ +static void qrtr_tx_resume(struct qrtr_node *node, struct sk_buff *skb) +{ + struct qrtr_ctrl_pkt *pkt = (struct qrtr_ctrl_pkt *)skb->data; + u64 remote_node = le32_to_cpu(pkt->client.node); + u32 remote_port = le32_to_cpu(pkt->client.port); + struct qrtr_tx_flow *flow; + unsigned long key; + + key = remote_node << 32 | remote_port; + + rcu_read_lock(); + flow = radix_tree_lookup(&node->qrtr_tx_flow, key); + rcu_read_unlock(); + if (flow) { + spin_lock(&flow->resume_tx.lock); + flow->pending = 0; + spin_unlock(&flow->resume_tx.lock); + wake_up_interruptible_all(&flow->resume_tx); + } + + consume_skb(skb); +} + +/** + * qrtr_tx_wait() - flow control for outgoing packets + * @node: qrtr_node that the packet is to be send to + * @dest_node: node id of the destination + * @dest_port: port number of the destination + * @type: type of message + * + * The flow control scheme is based around the low and high "watermarks". When + * the low watermark is passed the confirm_rx flag is set on the outgoing + * message, which will trigger the remote to send a control message of the type + * QRTR_TYPE_RESUME_TX to reset the counter. If the high watermark is hit + * further transmision should be paused. + * + * Return: 1 if confirm_rx should be set, 0 otherwise or errno failure + */ +static int qrtr_tx_wait(struct qrtr_node *node, int dest_node, int dest_port, + int type) +{ + unsigned long key = (u64)dest_node << 32 | dest_port; + struct qrtr_tx_flow *flow; + int confirm_rx = 0; + int ret; + + /* Never set confirm_rx on non-data packets */ + if (type != QRTR_TYPE_DATA) + return 0; + + mutex_lock(&node->qrtr_tx_lock); + flow = radix_tree_lookup(&node->qrtr_tx_flow, key); + if (!flow) { + flow = kzalloc(sizeof(*flow), GFP_KERNEL); + if (flow) { + init_waitqueue_head(&flow->resume_tx); + radix_tree_insert(&node->qrtr_tx_flow, key, flow); + } + } + mutex_unlock(&node->qrtr_tx_lock); + + /* Set confirm_rx if we where unable to find and allocate a flow */ + if (!flow) + return 1; + + spin_lock_irq(&flow->resume_tx.lock); + ret = wait_event_interruptible_locked_irq(flow->resume_tx, + flow->pending < QRTR_TX_FLOW_HIGH || + flow->tx_failed || + !node->ep); + if (ret < 0) { + confirm_rx = ret; + } else if (!node->ep) { + confirm_rx = -EPIPE; + } else if (flow->tx_failed) { + flow->tx_failed = 0; + confirm_rx = 1; + } else { + flow->pending++; + confirm_rx = flow->pending == QRTR_TX_FLOW_LOW; + } + spin_unlock_irq(&flow->resume_tx.lock); + + return confirm_rx; +} + +/** + * qrtr_tx_flow_failed() - flag that tx of confirm_rx flagged messages failed + * @node: qrtr_node that the packet is to be send to + * @dest_node: node id of the destination + * @dest_port: port number of the destination + * + * Signal that the transmission of a message with confirm_rx flag failed. The + * flow's "pending" counter will keep incrementing towards QRTR_TX_FLOW_HIGH, + * at which point transmission would stall forever waiting for the resume TX + * message associated with the dropped confirm_rx message. + * Work around this by marking the flow as having a failed transmission and + * cause the next transmission attempt to be sent with the confirm_rx. + */ +static void qrtr_tx_flow_failed(struct qrtr_node *node, int dest_node, + int dest_port) +{ + unsigned long key = (u64)dest_node << 32 | dest_port; + struct qrtr_tx_flow *flow; + + rcu_read_lock(); + flow = radix_tree_lookup(&node->qrtr_tx_flow, key); + rcu_read_unlock(); + if (flow) { + spin_lock_irq(&flow->resume_tx.lock); + flow->tx_failed = 1; + spin_unlock_irq(&flow->resume_tx.lock); + } +} + /* Pass an outgoing packet socket buffer to the endpoint driver. */ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, @@ -178,6 +332,13 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, struct qrtr_hdr_v1 *hdr; size_t len = skb->len; int rc = -ENODEV; + int confirm_rx; + + confirm_rx = qrtr_tx_wait(node, to->sq_node, to->sq_port, type); + if (confirm_rx < 0) { + kfree_skb(skb); + return confirm_rx; + } hdr = skb_push(skb, sizeof(*hdr)); hdr->version = cpu_to_le32(QRTR_PROTO_VER_1); @@ -193,9 +354,9 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, } hdr->size = cpu_to_le32(len); - hdr->confirm_rx = 0; + hdr->confirm_rx = !!confirm_rx; - skb_put_padto(skb, ALIGN(len, 4)); + skb_put_padto(skb, ALIGN(len, 4) + sizeof(*hdr)); mutex_lock(&node->ep_lock); if (node->ep) @@ -204,6 +365,11 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, kfree_skb(skb); mutex_unlock(&node->ep_lock); + /* Need to ensure that a subsequent message carries the otherwise lost + * confirm_rx flag if we dropped this one */ + if (rc && confirm_rx) + qrtr_tx_flow_failed(node, to->sq_node, to->sq_port); + return rc; } @@ -214,11 +380,12 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, static struct qrtr_node *qrtr_node_lookup(unsigned int nid) { struct qrtr_node *node; + unsigned long flags; - mutex_lock(&qrtr_node_lock); + spin_lock_irqsave(&qrtr_nodes_lock, flags); node = radix_tree_lookup(&qrtr_nodes, nid); node = qrtr_node_acquire(node); - mutex_unlock(&qrtr_node_lock); + spin_unlock_irqrestore(&qrtr_nodes_lock, flags); return node; } @@ -230,13 +397,15 @@ static struct qrtr_node *qrtr_node_lookup(unsigned int nid) */ static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid) { + unsigned long flags; + if (node->nid != QRTR_EP_NID_AUTO || nid == QRTR_EP_NID_AUTO) return; - mutex_lock(&qrtr_node_lock); + spin_lock_irqsave(&qrtr_nodes_lock, flags); radix_tree_insert(&qrtr_nodes, nid, node); node->nid = nid; - mutex_unlock(&qrtr_node_lock); + spin_unlock_irqrestore(&qrtr_nodes_lock, flags); } /** @@ -252,6 +421,7 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) struct qrtr_node *node = ep->node; const struct qrtr_hdr_v1 *v1; const struct qrtr_hdr_v2 *v2; + struct qrtr_sock *ipc; struct sk_buff *skb; struct qrtr_cb *cb; unsigned int size; @@ -310,13 +480,26 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) if (len != ALIGN(size, 4) + hdrlen) goto err; - if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA) + if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA && + cb->type != QRTR_TYPE_RESUME_TX) goto err; skb_put_data(skb, data + hdrlen, size); - skb_queue_tail(&node->rx_queue, skb); - schedule_work(&node->work); + qrtr_node_assign(node, cb->src_node); + + if (cb->type == QRTR_TYPE_RESUME_TX) { + qrtr_tx_resume(node, skb); + } else { + ipc = qrtr_port_lookup(cb->dst_port); + if (!ipc) + goto err; + + if (sock_queue_rcv_skb(&ipc->sk, skb)) + goto err; + + qrtr_port_put(ipc); + } return 0; @@ -351,61 +534,6 @@ static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt) return skb; } -static struct qrtr_sock *qrtr_port_lookup(int port); -static void qrtr_port_put(struct qrtr_sock *ipc); - -/* Handle and route a received packet. - * - * This will auto-reply with resume-tx packet as necessary. - */ -static void qrtr_node_rx_work(struct work_struct *work) -{ - struct qrtr_node *node = container_of(work, struct qrtr_node, work); - struct qrtr_ctrl_pkt *pkt; - struct sockaddr_qrtr dst; - struct sockaddr_qrtr src; - struct sk_buff *skb; - - while ((skb = skb_dequeue(&node->rx_queue)) != NULL) { - struct qrtr_sock *ipc; - struct qrtr_cb *cb; - int confirm; - - cb = (struct qrtr_cb *)skb->cb; - src.sq_node = cb->src_node; - src.sq_port = cb->src_port; - dst.sq_node = cb->dst_node; - dst.sq_port = cb->dst_port; - confirm = !!cb->confirm_rx; - - qrtr_node_assign(node, cb->src_node); - - ipc = qrtr_port_lookup(cb->dst_port); - if (!ipc) { - kfree_skb(skb); - } else { - if (sock_queue_rcv_skb(&ipc->sk, skb)) - kfree_skb(skb); - - qrtr_port_put(ipc); - } - - if (confirm) { - skb = qrtr_alloc_ctrl_packet(&pkt); - if (!skb) - break; - - pkt->cmd = cpu_to_le32(QRTR_TYPE_RESUME_TX); - pkt->client.node = cpu_to_le32(dst.sq_node); - pkt->client.port = cpu_to_le32(dst.sq_port); - - if (qrtr_node_enqueue(node, skb, QRTR_TYPE_RESUME_TX, - &dst, &src)) - break; - } - } -} - /** * qrtr_endpoint_register() - register a new endpoint * @ep: endpoint to register @@ -425,13 +553,15 @@ int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid) if (!node) return -ENOMEM; - INIT_WORK(&node->work, qrtr_node_rx_work); kref_init(&node->ref); mutex_init(&node->ep_lock); skb_queue_head_init(&node->rx_queue); node->nid = QRTR_EP_NID_AUTO; node->ep = ep; + INIT_RADIX_TREE(&node->qrtr_tx_flow, GFP_KERNEL); + mutex_init(&node->qrtr_tx_lock); + qrtr_node_assign(node, nid); mutex_lock(&qrtr_node_lock); @@ -452,8 +582,11 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) struct qrtr_node *node = ep->node; struct sockaddr_qrtr src = {AF_QIPCRTR, node->nid, QRTR_PORT_CTRL}; struct sockaddr_qrtr dst = {AF_QIPCRTR, qrtr_local_nid, QRTR_PORT_CTRL}; + struct radix_tree_iter iter; struct qrtr_ctrl_pkt *pkt; + struct qrtr_tx_flow *flow; struct sk_buff *skb; + void __rcu **slot; mutex_lock(&node->ep_lock); node->ep = NULL; @@ -466,6 +599,14 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst); } + /* Wake up any transmitters waiting for resume-tx from the node */ + mutex_lock(&node->qrtr_tx_lock); + radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) { + flow = *slot; + wake_up_interruptible_all(&flow->resume_tx); + } + mutex_unlock(&node->qrtr_tx_lock); + qrtr_node_release(node); ep->node = NULL; } @@ -482,11 +623,11 @@ static struct qrtr_sock *qrtr_port_lookup(int port) if (port == QRTR_PORT_CTRL) port = 0; - mutex_lock(&qrtr_port_lock); + rcu_read_lock(); ipc = idr_find(&qrtr_ports, port); if (ipc) sock_hold(&ipc->sk); - mutex_unlock(&qrtr_port_lock); + rcu_read_unlock(); return ipc; } @@ -528,6 +669,10 @@ static void qrtr_port_remove(struct qrtr_sock *ipc) mutex_lock(&qrtr_port_lock); idr_remove(&qrtr_ports, port); mutex_unlock(&qrtr_port_lock); + + /* Ensure that if qrtr_port_lookup() did enter the RCU read section we + * wait for it to up increment the refcount */ + synchronize_rcu(); } /* Assign port number to socket. @@ -815,6 +960,34 @@ out_node: return rc; } +static int qrtr_send_resume_tx(struct qrtr_cb *cb) +{ + struct sockaddr_qrtr remote = { AF_QIPCRTR, cb->src_node, cb->src_port }; + struct sockaddr_qrtr local = { AF_QIPCRTR, cb->dst_node, cb->dst_port }; + struct qrtr_ctrl_pkt *pkt; + struct qrtr_node *node; + struct sk_buff *skb; + int ret; + + node = qrtr_node_lookup(remote.sq_node); + if (!node) + return -EINVAL; + + skb = qrtr_alloc_ctrl_packet(&pkt); + if (!skb) + return -ENOMEM; + + pkt->cmd = cpu_to_le32(QRTR_TYPE_RESUME_TX); + pkt->client.node = cpu_to_le32(cb->dst_node); + pkt->client.port = cpu_to_le32(cb->dst_port); + + ret = qrtr_node_enqueue(node, skb, QRTR_TYPE_RESUME_TX, &local, &remote); + + qrtr_node_release(node); + + return ret; +} + static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { @@ -837,6 +1010,7 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, release_sock(sk); return rc; } + cb = (struct qrtr_cb *)skb->cb; copied = skb->len; if (copied > size) { @@ -850,7 +1024,6 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, rc = copied; if (addr) { - cb = (struct qrtr_cb *)skb->cb; addr->sq_family = AF_QIPCRTR; addr->sq_node = cb->src_node; addr->sq_port = cb->src_port; @@ -858,6 +1031,9 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, } out: + if (cb->confirm_rx) + qrtr_send_resume_tx(cb); + skb_free_datagram(sk, skb); release_sock(sk); diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c index ccff1e544c21..15ce9b642b25 100644 --- a/net/qrtr/tun.c +++ b/net/qrtr/tun.c @@ -84,11 +84,14 @@ static ssize_t qrtr_tun_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!kbuf) return -ENOMEM; - if (!copy_from_iter_full(kbuf, len, from)) + if (!copy_from_iter_full(kbuf, len, from)) { + kfree(kbuf); return -EFAULT; + } ret = qrtr_endpoint_post(&tun->ep, kbuf, len); + kfree(kbuf); return ret < 0 ? ret : len; } @@ -108,15 +111,11 @@ static __poll_t qrtr_tun_poll(struct file *filp, poll_table *wait) static int qrtr_tun_release(struct inode *inode, struct file *filp) { struct qrtr_tun *tun = filp->private_data; - struct sk_buff *skb; qrtr_endpoint_unregister(&tun->ep); /* Discard all SKBs */ - while (!skb_queue_empty(&tun->queue)) { - skb = skb_dequeue(&tun->queue); - kfree_skb(skb); - } + skb_queue_purge(&tun->queue); kfree(tun); diff --git a/net/rds/Kconfig b/net/rds/Kconfig index 38ea7f0f2699..c64e154bc18f 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -23,6 +23,6 @@ config RDS_TCP This transport does not support RDMA operations. config RDS_DEBUG - bool "RDS debugging messages" + bool "RDS debugging messages" depends on RDS - default n + default n diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 2b969f99ef13..1a5bf3fa4578 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -559,7 +559,7 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr, ret = -EDESTADDRREQ; break; } - if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) || + if (ipv4_is_multicast(sin->sin_addr.s_addr) || sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) { ret = -EINVAL; break; @@ -593,7 +593,7 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr, addr4 = sin6->sin6_addr.s6_addr32[3]; if (addr4 == htonl(INADDR_ANY) || addr4 == htonl(INADDR_BROADCAST) || - IN_MULTICAST(ntohl(addr4))) { + ipv4_is_multicast(addr4)) { ret = -EPROTOTYPE; break; } @@ -705,7 +705,7 @@ static int rds_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_SEQPACKET || protocol) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto, kern); + sk = sk_alloc(net, AF_RDS, GFP_KERNEL, &rds_proto, kern); if (!sk) return -ENOMEM; @@ -741,6 +741,10 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len, spin_lock_bh(&rds_sock_lock); list_for_each_entry(rs, &rds_sock_list, rs_item) { + /* This option only supports IPv4 sockets. */ + if (!ipv6_addr_v4mapped(&rs->rs_bound_addr)) + continue; + read_lock(&rs->rs_recv_lock); /* XXX too lazy to maintain counts.. */ @@ -762,21 +766,60 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len, lens->each = sizeof(struct rds_info_message); } +#if IS_ENABLED(CONFIG_IPV6) +static void rds6_sock_inc_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_incoming *inc; + unsigned int total = 0; + struct rds_sock *rs; + + len /= sizeof(struct rds6_info_message); + + spin_lock_bh(&rds_sock_lock); + + list_for_each_entry(rs, &rds_sock_list, rs_item) { + read_lock(&rs->rs_recv_lock); + + list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { + total++; + if (total <= len) + rds6_inc_info_copy(inc, iter, &inc->i_saddr, + &rs->rs_bound_addr, 1); + } + + read_unlock(&rs->rs_recv_lock); + } + + spin_unlock_bh(&rds_sock_lock); + + lens->nr = total; + lens->each = sizeof(struct rds6_info_message); +} +#endif + static void rds_sock_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { struct rds_info_socket sinfo; + unsigned int cnt = 0; struct rds_sock *rs; len /= sizeof(struct rds_info_socket); spin_lock_bh(&rds_sock_lock); - if (len < rds_sock_count) + if (len < rds_sock_count) { + cnt = rds_sock_count; goto out; + } list_for_each_entry(rs, &rds_sock_list, rs_item) { + /* This option only supports IPv4 sockets. */ + if (!ipv6_addr_v4mapped(&rs->rs_bound_addr)) + continue; sinfo.sndbuf = rds_sk_sndbuf(rs); sinfo.rcvbuf = rds_sk_rcvbuf(rs); sinfo.bound_addr = rs->rs_bound_addr_v4; @@ -786,15 +829,51 @@ static void rds_sock_info(struct socket *sock, unsigned int len, sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); rds_info_copy(iter, &sinfo, sizeof(sinfo)); + cnt++; } out: - lens->nr = rds_sock_count; + lens->nr = cnt; lens->each = sizeof(struct rds_info_socket); spin_unlock_bh(&rds_sock_lock); } +#if IS_ENABLED(CONFIG_IPV6) +static void rds6_sock_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds6_info_socket sinfo6; + struct rds_sock *rs; + + len /= sizeof(struct rds6_info_socket); + + spin_lock_bh(&rds_sock_lock); + + if (len < rds_sock_count) + goto out; + + list_for_each_entry(rs, &rds_sock_list, rs_item) { + sinfo6.sndbuf = rds_sk_sndbuf(rs); + sinfo6.rcvbuf = rds_sk_rcvbuf(rs); + sinfo6.bound_addr = rs->rs_bound_addr; + sinfo6.connected_addr = rs->rs_conn_addr; + sinfo6.bound_port = rs->rs_bound_port; + sinfo6.connected_port = rs->rs_conn_port; + sinfo6.inum = sock_i_ino(rds_rs_to_sk(rs)); + + rds_info_copy(iter, &sinfo6, sizeof(sinfo6)); + } + + out: + lens->nr = rds_sock_count; + lens->each = sizeof(struct rds6_info_socket); + + spin_unlock_bh(&rds_sock_lock); +} +#endif + static void rds_exit(void) { sock_unregister(rds_family_ops.family); @@ -808,6 +887,10 @@ static void rds_exit(void) rds_bind_lock_destroy(); rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info); rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); +#if IS_ENABLED(CONFIG_IPV6) + rds_info_deregister_func(RDS6_INFO_SOCKETS, rds6_sock_info); + rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info); +#endif } module_exit(rds_exit); @@ -845,6 +928,10 @@ static int rds_init(void) rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info); rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); +#if IS_ENABLED(CONFIG_IPV6) + rds_info_register_func(RDS6_INFO_SOCKETS, rds6_sock_info); + rds_info_register_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info); +#endif goto out; diff --git a/net/rds/bind.c b/net/rds/bind.c index 0f4398e7f2a7..5b5fb4ca8d3e 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -181,7 +181,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < sizeof(struct sockaddr_in) || sin->sin_addr.s_addr == htonl(INADDR_ANY) || sin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || - IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + ipv4_is_multicast(sin->sin_addr.s_addr)) return -EINVAL; ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr); binding_addr = &v6addr; @@ -206,7 +206,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) addr4 = sin6->sin6_addr.s6_addr32[3]; if (addr4 == htonl(INADDR_ANY) || addr4 == htonl(INADDR_BROADCAST) || - IN_MULTICAST(ntohl(addr4))) + ipv4_is_multicast(addr4)) return -EINVAL; } /* The scope ID must be specified for link local address. */ @@ -239,34 +239,33 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - sock_set_flag(sk, SOCK_RCU_FREE); - ret = rds_add_bound(rs, binding_addr, &port, scope_id); - if (ret) - goto out; - - if (rs->rs_transport) { /* previously bound */ + /* The transport can be set using SO_RDS_TRANSPORT option before the + * socket is bound. + */ + if (rs->rs_transport) { trans = rs->rs_transport; - if (trans->laddr_check(sock_net(sock->sk), + if (!trans->laddr_check || + trans->laddr_check(sock_net(sock->sk), binding_addr, scope_id) != 0) { ret = -ENOPROTOOPT; - rds_remove_bound(rs); - } else { - ret = 0; + goto out; } - goto out; - } - trans = rds_trans_get_preferred(sock_net(sock->sk), binding_addr, - scope_id); - if (!trans) { - ret = -EADDRNOTAVAIL; - rds_remove_bound(rs); - pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n", - __func__, binding_addr); - goto out; + } else { + trans = rds_trans_get_preferred(sock_net(sock->sk), + binding_addr, scope_id); + if (!trans) { + ret = -EADDRNOTAVAIL; + pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n", + __func__, binding_addr); + goto out; + } + rs->rs_transport = trans; } - rs->rs_transport = trans; - ret = 0; + sock_set_flag(sk, SOCK_RCU_FREE); + ret = rds_add_bound(rs, binding_addr, &port, scope_id); + if (ret) + rs->rs_transport = NULL; out: release_sock(sk); diff --git a/net/rds/ib.c b/net/rds/ib.c index ec05d91aa9a2..a792d8a3872a 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -30,6 +30,7 @@ * SOFTWARE. * */ +#include <linux/dmapool.h> #include <linux/kernel.h> #include <linux/in.h> #include <linux/if.h> @@ -107,6 +108,7 @@ static void rds_ib_dev_free(struct work_struct *work) rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool); if (rds_ibdev->pd) ib_dealloc_pd(rds_ibdev->pd); + dma_pool_destroy(rds_ibdev->rid_hdrs_pool); list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { list_del(&i_ipaddr->list); @@ -143,6 +145,9 @@ static void rds_ib_add_one(struct ib_device *device) refcount_set(&rds_ibdev->refcount, 1); INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); + INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); + INIT_LIST_HEAD(&rds_ibdev->conn_list); + rds_ibdev->max_wrs = device->attrs.max_qp_wr; rds_ibdev->max_sge = min(device->attrs.max_send_sge, RDS_IB_MAX_SGE); @@ -151,6 +156,13 @@ static void rds_ib_add_one(struct ib_device *device) has_fmr = (device->ops.alloc_fmr && device->ops.dealloc_fmr && device->ops.map_phys_fmr && device->ops.unmap_fmr); rds_ibdev->use_fastreg = (has_fr && !has_fmr); + rds_ibdev->odp_capable = + !!(device->attrs.device_cap_flags & + IB_DEVICE_ON_DEMAND_PAGING) && + !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_WRITE) && + !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_READ); rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; rds_ibdev->max_1m_mrs = device->attrs.max_mr ? @@ -179,6 +191,12 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->pd = NULL; goto put_dev; } + rds_ibdev->rid_hdrs_pool = dma_pool_create(device->name, + device->dma_device, + sizeof(struct rds_header), + L1_CACHE_BYTES, 0); + if (!rds_ibdev->rid_hdrs_pool) + goto put_dev; rds_ibdev->mr_1m_pool = rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL); @@ -203,9 +221,6 @@ static void rds_ib_add_one(struct ib_device *device) device->name, rds_ibdev->use_fastreg ? "FRMR" : "FMR"); - INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); - INIT_LIST_HEAD(&rds_ibdev->conn_list); - down_write(&rds_ib_devices_lock); list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); up_write(&rds_ib_devices_lock); @@ -291,7 +306,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, void *buffer) { struct rds_info_rdma_connection *iinfo = buffer; - struct rds_ib_connection *ic; + struct rds_ib_connection *ic = conn->c_transport_data; /* We will only ever look at IB transports */ if (conn->c_trans != &rds_ib_transport) @@ -301,15 +316,16 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, iinfo->src_addr = conn->c_laddr.s6_addr32[3]; iinfo->dst_addr = conn->c_faddr.s6_addr32[3]; - iinfo->tos = conn->c_tos; + if (ic) { + iinfo->tos = conn->c_tos; + iinfo->sl = ic->i_sl; + } memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); if (rds_conn_state(conn) == RDS_CONN_UP) { struct rds_ib_device *rds_ibdev; - ic = conn->c_transport_data; - rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo->src_gid, (union ib_gid *)&iinfo->dst_gid); @@ -329,7 +345,7 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn, void *buffer) { struct rds6_info_rdma_connection *iinfo6 = buffer; - struct rds_ib_connection *ic; + struct rds_ib_connection *ic = conn->c_transport_data; /* We will only ever look at IB transports */ if (conn->c_trans != &rds_ib_transport) @@ -337,6 +353,10 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn, iinfo6->src_addr = conn->c_laddr; iinfo6->dst_addr = conn->c_faddr; + if (ic) { + iinfo6->tos = conn->c_tos; + iinfo6->sl = ic->i_sl; + } memset(&iinfo6->src_gid, 0, sizeof(iinfo6->src_gid)); memset(&iinfo6->dst_gid, 0, sizeof(iinfo6->dst_gid)); @@ -344,7 +364,6 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn, if (rds_conn_state(conn) == RDS_CONN_UP) { struct rds_ib_device *rds_ibdev; - ic = conn->c_transport_data; rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo6->src_gid, (union ib_gid *)&iinfo6->dst_gid); rds_ibdev = ic->rds_ibdev; diff --git a/net/rds/ib.h b/net/rds/ib.h index 303c6ee8bdb7..0296f1f7acda 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -165,8 +165,8 @@ struct rds_ib_connection { /* tx */ struct rds_ib_work_ring i_send_ring; struct rm_data_op *i_data_op; - struct rds_header *i_send_hdrs; - dma_addr_t i_send_hdrs_dma; + struct rds_header **i_send_hdrs; + dma_addr_t *i_send_hdrs_dma; struct rds_ib_send_work *i_sends; atomic_t i_signaled_sends; @@ -175,8 +175,8 @@ struct rds_ib_connection { struct rds_ib_work_ring i_recv_ring; struct rds_ib_incoming *i_ibinc; u32 i_recv_data_rem; - struct rds_header *i_recv_hdrs; - dma_addr_t i_recv_hdrs_dma; + struct rds_header **i_recv_hdrs; + dma_addr_t *i_recv_hdrs_dma; struct rds_ib_recv_work *i_recvs; u64 i_ack_recv; /* last ACK received */ struct rds_ib_refill_cache i_cache_incs; @@ -220,6 +220,7 @@ struct rds_ib_connection { /* Send/Recv vectors */ int i_scq_vector; int i_rcq_vector; + u8 i_sl; }; /* This assumes that atomic_t is at least 32 bits */ @@ -245,7 +246,9 @@ struct rds_ib_device { struct list_head conn_list; struct ib_device *dev; struct ib_pd *pd; - bool use_fastreg; + struct dma_pool *rid_hdrs_pool; /* RDS headers DMA pool */ + u8 use_fastreg:1; + u8 odp_capable:1; unsigned int max_mrs; struct rds_ib_mr_pool *mr_1m_pool; @@ -380,7 +383,11 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6); void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event); - +struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev, + struct dma_pool *pool, + dma_addr_t **dma_addrs, u32 num_hdrs); +void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs, + dma_addr_t *dma_addrs, u32 num_hdrs); #define rds_ib_conn_error(conn, fmt...) \ __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index fddaa09f7b0d..c71f4328d138 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -30,6 +30,7 @@ * SOFTWARE. * */ +#include <linux/dmapool.h> #include <linux/kernel.h> #include <linux/in.h> #include <linux/slab.h> @@ -152,6 +153,9 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even RDS_PROTOCOL_MINOR(conn->c_version), ic->i_flowctl ? ", flow control" : ""); + /* receive sl from the peer */ + ic->i_sl = ic->i_cm_id->route.path_rec->sl; + atomic_set(&ic->i_cq_quiesce, 0); /* Init rings and fill recv. this needs to wait until protocol @@ -436,6 +440,68 @@ static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index) rds_ibdev->vector_load[index]--; } +/* Allocate DMA coherent memory to be used to store struct rds_header for + * sending/receiving packets. The pointers to the DMA memory and the + * associated DMA addresses are stored in two arrays. + * + * @ibdev: the IB device + * @pool: the DMA memory pool + * @dma_addrs: pointer to the array for storing DMA addresses + * @num_hdrs: number of headers to allocate + * + * It returns the pointer to the array storing the DMA memory pointers. On + * error, NULL pointer is returned. + */ +struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev, + struct dma_pool *pool, + dma_addr_t **dma_addrs, u32 num_hdrs) +{ + struct rds_header **hdrs; + dma_addr_t *hdr_daddrs; + u32 i; + + hdrs = kvmalloc_node(sizeof(*hdrs) * num_hdrs, GFP_KERNEL, + ibdev_to_node(ibdev)); + if (!hdrs) + return NULL; + + hdr_daddrs = kvmalloc_node(sizeof(*hdr_daddrs) * num_hdrs, GFP_KERNEL, + ibdev_to_node(ibdev)); + if (!hdr_daddrs) { + kvfree(hdrs); + return NULL; + } + + for (i = 0; i < num_hdrs; i++) { + hdrs[i] = dma_pool_zalloc(pool, GFP_KERNEL, &hdr_daddrs[i]); + if (!hdrs[i]) { + rds_dma_hdrs_free(pool, hdrs, hdr_daddrs, i); + return NULL; + } + } + + *dma_addrs = hdr_daddrs; + return hdrs; +} + +/* Free the DMA memory used to store struct rds_header. + * + * @pool: the DMA memory pool + * @hdrs: pointer to the array storing DMA memory pointers + * @dma_addrs: pointer to the array storing DMA addresses + * @num_hdars: number of headers to free. + */ +void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs, + dma_addr_t *dma_addrs, u32 num_hdrs) +{ + u32 i; + + for (i = 0; i < num_hdrs; i++) + dma_pool_free(pool, hdrs[i], dma_addrs[i]); + kvfree(hdrs); + kvfree(dma_addrs); +} + /* * This needs to be very careful to not leave IS_ERR pointers around for * cleanup to trip over. @@ -447,7 +513,9 @@ static int rds_ib_setup_qp(struct rds_connection *conn) struct ib_qp_init_attr attr; struct ib_cq_init_attr cq_attr = {}; struct rds_ib_device *rds_ibdev; + unsigned long max_wrs; int ret, fr_queue_space; + struct dma_pool *pool; /* * It's normal to see a null device if an incoming connection races @@ -466,10 +534,15 @@ static int rds_ib_setup_qp(struct rds_connection *conn) /* add the conn now so that connection establishment has the dev */ rds_ib_add_conn(rds_ibdev, conn); - if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) - rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); - if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1) - rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1); + max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_send_wr + 1 ? + rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_send_wr; + if (ic->i_send_ring.w_nr != max_wrs) + rds_ib_ring_resize(&ic->i_send_ring, max_wrs); + + max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_recv_wr + 1 ? + rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_recv_wr; + if (ic->i_recv_ring.w_nr != max_wrs) + rds_ib_ring_resize(&ic->i_recv_ring, max_wrs); /* Protection domain and memory range */ ic->i_pd = rds_ibdev->pd; @@ -538,31 +611,28 @@ static int rds_ib_setup_qp(struct rds_connection *conn) goto recv_cq_out; } - ic->i_send_hdrs = ib_dma_alloc_coherent(dev, - ic->i_send_ring.w_nr * - sizeof(struct rds_header), - &ic->i_send_hdrs_dma, GFP_KERNEL); + pool = rds_ibdev->rid_hdrs_pool; + ic->i_send_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_send_hdrs_dma, + ic->i_send_ring.w_nr); if (!ic->i_send_hdrs) { ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent send failed\n"); + rdsdebug("DMA send hdrs alloc failed\n"); goto qp_out; } - ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, - ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - &ic->i_recv_hdrs_dma, GFP_KERNEL); + ic->i_recv_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_recv_hdrs_dma, + ic->i_recv_ring.w_nr); if (!ic->i_recv_hdrs) { ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent recv failed\n"); + rdsdebug("DMA recv hdrs alloc failed\n"); goto send_hdrs_dma_out; } - ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), - &ic->i_ack_dma, GFP_KERNEL); + ic->i_ack = dma_pool_zalloc(pool, GFP_KERNEL, + &ic->i_ack_dma); if (!ic->i_ack) { ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent ack failed\n"); + rdsdebug("DMA ack header alloc failed\n"); goto recv_hdrs_dma_out; } @@ -593,17 +663,23 @@ static int rds_ib_setup_qp(struct rds_connection *conn) sends_out: vfree(ic->i_sends); + ack_dma_out: - ib_dma_free_coherent(dev, sizeof(struct rds_header), - ic->i_ack, ic->i_ack_dma); + dma_pool_free(pool, ic->i_ack, ic->i_ack_dma); + ic->i_ack = NULL; + recv_hdrs_dma_out: - ib_dma_free_coherent(dev, ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - ic->i_recv_hdrs, ic->i_recv_hdrs_dma); + rds_dma_hdrs_free(pool, ic->i_recv_hdrs, ic->i_recv_hdrs_dma, + ic->i_recv_ring.w_nr); + ic->i_recv_hdrs = NULL; + ic->i_recv_hdrs_dma = NULL; + send_hdrs_dma_out: - ib_dma_free_coherent(dev, ic->i_send_ring.w_nr * - sizeof(struct rds_header), - ic->i_send_hdrs, ic->i_send_hdrs_dma); + rds_dma_hdrs_free(pool, ic->i_send_hdrs, ic->i_send_hdrs_dma, + ic->i_send_ring.w_nr); + ic->i_send_hdrs = NULL; + ic->i_send_hdrs_dma = NULL; + qp_out: rdma_destroy_qp(ic->i_cm_id); recv_cq_out: @@ -981,8 +1057,6 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ic->i_cm_id ? ic->i_cm_id->qp : NULL); if (ic->i_cm_id) { - struct ib_device *dev = ic->i_cm_id->device; - rdsdebug("disconnecting cm %p\n", ic->i_cm_id); err = rdma_disconnect(ic->i_cm_id); if (err) { @@ -1032,24 +1106,39 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ib_destroy_cq(ic->i_recv_cq); } - /* then free the resources that ib callbacks use */ - if (ic->i_send_hdrs) - ib_dma_free_coherent(dev, - ic->i_send_ring.w_nr * - sizeof(struct rds_header), - ic->i_send_hdrs, - ic->i_send_hdrs_dma); - - if (ic->i_recv_hdrs) - ib_dma_free_coherent(dev, - ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - ic->i_recv_hdrs, - ic->i_recv_hdrs_dma); - - if (ic->i_ack) - ib_dma_free_coherent(dev, sizeof(struct rds_header), - ic->i_ack, ic->i_ack_dma); + if (ic->rds_ibdev) { + struct dma_pool *pool; + + pool = ic->rds_ibdev->rid_hdrs_pool; + + /* then free the resources that ib callbacks use */ + if (ic->i_send_hdrs) { + rds_dma_hdrs_free(pool, ic->i_send_hdrs, + ic->i_send_hdrs_dma, + ic->i_send_ring.w_nr); + ic->i_send_hdrs = NULL; + ic->i_send_hdrs_dma = NULL; + } + + if (ic->i_recv_hdrs) { + rds_dma_hdrs_free(pool, ic->i_recv_hdrs, + ic->i_recv_hdrs_dma, + ic->i_recv_ring.w_nr); + ic->i_recv_hdrs = NULL; + ic->i_recv_hdrs_dma = NULL; + } + + if (ic->i_ack) { + dma_pool_free(pool, ic->i_ack, ic->i_ack_dma); + ic->i_ack = NULL; + } + } else { + WARN_ON(ic->i_send_hdrs); + WARN_ON(ic->i_send_hdrs_dma); + WARN_ON(ic->i_recv_hdrs); + WARN_ON(ic->i_recv_hdrs_dma); + WARN_ON(ic->i_ack); + } if (ic->i_sends) rds_ib_send_clear_ring(ic); @@ -1068,9 +1157,6 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ic->i_pd = NULL; ic->i_send_cq = NULL; ic->i_recv_cq = NULL; - ic->i_send_hdrs = NULL; - ic->i_recv_hdrs = NULL; - ic->i_ack = NULL; } BUG_ON(ic->rds_ibdev); @@ -1096,8 +1182,9 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ic->i_flowctl = 0; atomic_set(&ic->i_credits, 0); - rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); - rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + /* Re-init rings, but retain sizes. */ + rds_ib_ring_init(&ic->i_send_ring, ic->i_send_ring.w_nr); + rds_ib_ring_init(&ic->i_recv_ring, ic->i_recv_ring.w_nr); if (ic->i_ibinc) { rds_inc_put(&ic->i_ibinc->ii_inc); @@ -1144,8 +1231,8 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) * rds_ib_conn_shutdown() waits for these to be emptied so they * must be initialized before it can be called. */ - rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); - rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + rds_ib_ring_init(&ic->i_send_ring, 0); + rds_ib_ring_init(&ic->i_recv_ring, 0); ic->conn = conn; conn->c_transport_data = ic; diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index 9045a8c0edff..0c8252d7fe2b 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -67,6 +67,7 @@ struct rds_ib_frmr { /* This is stored as mr->r_trans_private. */ struct rds_ib_mr { + struct delayed_work work; struct rds_ib_device *device; struct rds_ib_mr_pool *pool; struct rds_ib_connection *ic; @@ -81,9 +82,11 @@ struct rds_ib_mr { unsigned int sg_len; int sg_dma_len; + u8 odp:1; union { struct rds_ib_fmr fmr; struct rds_ib_frmr frmr; + struct ib_mr *mr; } u; }; @@ -122,12 +125,14 @@ void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_sock *rs, u32 *key_ret, - struct rds_connection *conn); + struct rds_connection *conn, u64 start, u64 length, + int need_odp); void rds_ib_sync_mr(void *trans_private, int dir); void rds_ib_free_mr(void *trans_private, int invalidate); void rds_ib_flush_mrs(void); int rds_ib_mr_init(void); void rds_ib_mr_exit(void); +u32 rds_ib_get_lkey(void *trans_private); void __rds_ib_teardown_mr(struct rds_ib_mr *); void rds_ib_teardown_mr(struct rds_ib_mr *); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index c8c1e3ae8d84..b34b24e237f8 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -37,8 +37,15 @@ #include "rds_single_path.h" #include "ib_mr.h" +#include "rds.h" struct workqueue_struct *rds_ib_mr_wq; +struct rds_ib_dereg_odp_mr { + struct work_struct work; + struct ib_mr *mr; +}; + +static void rds_ib_odp_mr_worker(struct work_struct *work); static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) { @@ -213,6 +220,9 @@ void rds_ib_sync_mr(void *trans_private, int direction) struct rds_ib_mr *ibmr = trans_private; struct rds_ib_device *rds_ibdev = ibmr->device; + if (ibmr->odp) + return; + switch (direction) { case DMA_FROM_DEVICE: ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg, @@ -482,6 +492,16 @@ void rds_ib_free_mr(void *trans_private, int invalidate) rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); + if (ibmr->odp) { + /* A MR created and marked as use_once. We use delayed work, + * because there is a change that we are in interrupt and can't + * call to ib_dereg_mr() directly. + */ + INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker); + queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0); + return; + } + /* Return it to the pool's free list */ if (rds_ibdev->use_fastreg) rds_ib_free_frmr_list(ibmr); @@ -526,9 +546,17 @@ void rds_ib_flush_mrs(void) up_read(&rds_ib_devices_lock); } +u32 rds_ib_get_lkey(void *trans_private) +{ + struct rds_ib_mr *ibmr = trans_private; + + return ibmr->u.mr->lkey; +} + void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_sock *rs, u32 *key_ret, - struct rds_connection *conn) + struct rds_connection *conn, + u64 start, u64 length, int need_odp) { struct rds_ib_device *rds_ibdev; struct rds_ib_mr *ibmr = NULL; @@ -541,6 +569,51 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, goto out; } + if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) { + u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start; + int access_flags = + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC | + IB_ACCESS_ON_DEMAND); + struct ib_sge sge = {}; + struct ib_mr *ib_mr; + + if (!rds_ibdev->odp_capable) { + ret = -EOPNOTSUPP; + goto out; + } + + ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr, + access_flags); + + if (IS_ERR(ib_mr)) { + rdsdebug("rds_ib_get_user_mr returned %d\n", + IS_ERR(ib_mr)); + ret = PTR_ERR(ib_mr); + goto out; + } + if (key_ret) + *key_ret = ib_mr->rkey; + + ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); + if (!ibmr) { + ib_dereg_mr(ib_mr); + ret = -ENOMEM; + goto out; + } + ibmr->u.mr = ib_mr; + ibmr->odp = 1; + + sge.addr = virt_addr; + sge.length = length; + sge.lkey = ib_mr->lkey; + + ib_advise_mr(rds_ibdev->pd, + IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE, + IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1); + return ibmr; + } + if (conn) ic = conn->c_transport_data; @@ -629,3 +702,12 @@ void rds_ib_mr_exit(void) { destroy_workqueue(rds_ib_mr_wq); } + +static void rds_ib_odp_mr_worker(struct work_struct *work) +{ + struct rds_ib_mr *ibmr; + + ibmr = container_of(work, struct rds_ib_mr, work.work); + ib_dereg_mr(ibmr->u.mr); + kfree(ibmr); +} diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 3cae88cbdaa0..694d411dc72f 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -61,7 +61,7 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic) recv->r_wr.num_sge = RDS_IB_RECV_SGE; sge = &recv->r_sge[0]; - sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); + sge->addr = ic->i_recv_hdrs_dma[i]; sge->length = sizeof(struct rds_header); sge->lkey = ic->i_pd->local_dma_lkey; @@ -343,7 +343,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, WARN_ON(ret != 1); sge = &recv->r_sge[0]; - sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); + sge->addr = ic->i_recv_hdrs_dma[recv - ic->i_recvs]; sge->length = sizeof(struct rds_header); sge = &recv->r_sge[1]; @@ -385,6 +385,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) unsigned int posted = 0; int ret = 0; bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM); + bool must_wake = false; u32 pos; /* the goal here is to just make sure that someone, somewhere @@ -405,6 +406,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) recv = &ic->i_recvs[pos]; ret = rds_ib_recv_refill_one(conn, recv, gfp); if (ret) { + must_wake = true; break; } @@ -423,6 +425,11 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) } posted++; + + if ((posted > 128 && need_resched()) || posted > 8192) { + must_wake = true; + break; + } } /* We're doing flow control - update the window. */ @@ -445,10 +452,13 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) * if we should requeue. */ if (rds_conn_up(conn) && - ((can_wait && rds_ib_ring_low(&ic->i_recv_ring)) || + (must_wake || + (can_wait && rds_ib_ring_low(&ic->i_recv_ring)) || rds_ib_ring_empty(&ic->i_recv_ring))) { queue_delayed_work(rds_wq, &conn->c_recv_w, 1); } + if (can_wait) + cond_resched(); } /* @@ -851,7 +861,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, } data_len -= sizeof(struct rds_header); - ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; + ihdr = ic->i_recv_hdrs[recv - ic->i_recvs]; /* Validate the checksum. */ if (!rds_message_verify_checksum(ihdr)) { @@ -983,10 +993,11 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, } else { /* We expect errors as the qp is drained during shutdown */ if (rds_conn_up(conn) || rds_conn_connecting(conn)) - rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), disconnecting and reconnecting\n", + rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, wc->status, - ib_wc_status_msg(wc->status)); + ib_wc_status_msg(wc->status), + wc->vendor_err); } /* rds_ib_process_recv() doesn't always consume the frag, and @@ -1038,9 +1049,14 @@ int rds_ib_recv_init(void) si_meminfo(&si); rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE; - rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", - sizeof(struct rds_ib_incoming), - 0, SLAB_HWCACHE_ALIGN, NULL); + rds_ib_incoming_slab = + kmem_cache_create_usercopy("rds_ib_incoming", + sizeof(struct rds_ib_incoming), + 0, SLAB_HWCACHE_ALIGN, + offsetof(struct rds_ib_incoming, + ii_inc.i_usercopy), + sizeof(struct rds_inc_usercopy), + NULL); if (!rds_ib_incoming_slab) goto out; diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index dfe6237dafe2..dfe778220657 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -39,6 +39,7 @@ #include "rds_single_path.h" #include "rds.h" #include "ib.h" +#include "ib_mr.h" /* * Convert IB-specific error message to RDS error message and call core @@ -201,7 +202,8 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) send->s_wr.ex.imm_data = 0; sge = &send->s_sge[0]; - sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); + sge->addr = ic->i_send_hdrs_dma[i]; + sge->length = sizeof(struct rds_header); sge->lkey = ic->i_pd->local_dma_lkey; @@ -300,10 +302,10 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) /* We expect errors as the qp is drained during shutdown */ if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), disconnecting and reconnecting\n", + rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, wc->status, - ib_wc_status_msg(wc->status)); + ib_wc_status_msg(wc->status), wc->vendor_err); } } @@ -631,11 +633,14 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send->s_queued = jiffies; send->s_op = NULL; - send->s_sge[0].addr = ic->i_send_hdrs_dma - + (pos * sizeof(struct rds_header)); + send->s_sge[0].addr = ic->i_send_hdrs_dma[pos]; + send->s_sge[0].length = sizeof(struct rds_header); + send->s_sge[0].lkey = ic->i_pd->local_dma_lkey; + + memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, + sizeof(struct rds_header)); - memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); /* Set up the data, if present */ if (i < work_alloc @@ -647,6 +652,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send->s_sge[1].addr = sg_dma_address(scat); send->s_sge[1].addr += rm->data.op_dmaoff; send->s_sge[1].length = len; + send->s_sge[1].lkey = ic->i_pd->local_dma_lkey; bytes_sent += len; rm->data.op_dmaoff += len; @@ -674,7 +680,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, &send->s_wr, send->s_wr.num_sge, send->s_wr.next); if (ic->i_flowctl && adv_credits) { - struct rds_header *hdr = &ic->i_send_hdrs[pos]; + struct rds_header *hdr = ic->i_send_hdrs[pos]; /* add credit and redo the header checksum */ hdr->h_credit = adv_credits; @@ -855,20 +861,29 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) int ret; int num_sge; int nr_sig = 0; + u64 odp_addr = op->op_odp_addr; + u32 odp_lkey = 0; /* map the op the first time we see it */ - if (!op->op_mapped) { - op->op_count = ib_dma_map_sg(ic->i_cm_id->device, - op->op_sg, op->op_nents, (op->op_write) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); - if (op->op_count == 0) { - rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); - ret = -ENOMEM; /* XXX ? */ - goto out; + if (!op->op_odp_mr) { + if (!op->op_mapped) { + op->op_count = + ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, + op->op_nents, + (op->op_write) ? DMA_TO_DEVICE : + DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, + op->op_count); + if (op->op_count == 0) { + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + op->op_mapped = 1; } - - op->op_mapped = 1; + } else { + op->op_count = op->op_nents; + odp_lkey = rds_ib_get_lkey(op->op_odp_mr->r_trans_private); } /* @@ -920,14 +935,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) for (j = 0; j < send->s_rdma_wr.wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { len = sg_dma_len(scat); - send->s_sge[j].addr = sg_dma_address(scat); + if (!op->op_odp_mr) { + send->s_sge[j].addr = sg_dma_address(scat); + send->s_sge[j].lkey = ic->i_pd->local_dma_lkey; + } else { + send->s_sge[j].addr = odp_addr; + send->s_sge[j].lkey = odp_lkey; + } send->s_sge[j].length = len; - send->s_sge[j].lkey = ic->i_pd->local_dma_lkey; sent += len; rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); remote_addr += len; + odp_addr += len; scat++; } diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index 9252ad126335..ac46d8961b61 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -42,7 +42,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); static const char *const rds_ib_stat_names[] = { "ib_connect_raced", "ib_listen_closed_stale", - "s_ib_evt_handler_call", + "ib_evt_handler_call", "ib_tasklet_call", "ib_tx_cq_event", "ib_tx_ring_full", diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 916f5ec373d8..3341eee87bf9 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -156,11 +156,13 @@ void rds_rdma_drop_keys(struct rds_sock *rs) static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, struct page **pages, int write) { + unsigned int gup_flags = FOLL_LONGTERM; int ret; - ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0, - pages); + if (write) + gup_flags |= FOLL_WRITE; + ret = get_user_pages_fast(user_addr, nr_pages, gup_flags, pages); if (ret >= 0 && ret < nr_pages) { while (ret--) put_page(pages[ret]); @@ -175,13 +177,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, struct rds_conn_path *cp) { struct rds_mr *mr = NULL, *found; + struct scatterlist *sg = NULL; unsigned int nr_pages; struct page **pages = NULL; - struct scatterlist *sg; void *trans_private; unsigned long flags; rds_rdma_cookie_t cookie; - unsigned int nents; + unsigned int nents = 0; + int need_odp = 0; long i; int ret; @@ -195,6 +198,21 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, goto out; } + /* If the combination of the addr and size requested for this memory + * region causes an integer overflow, return error. + */ + if (((args->vec.addr + args->vec.bytes) < args->vec.addr) || + PAGE_ALIGN(args->vec.addr + args->vec.bytes) < + (args->vec.addr + args->vec.bytes)) { + ret = -EINVAL; + goto out; + } + + if (!can_do_mlock()) { + ret = -EPERM; + goto out; + } + nr_pages = rds_pages_in_vec(&args->vec); if (nr_pages == 0) { ret = -EINVAL; @@ -248,36 +266,44 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, * the zero page. */ ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); - if (ret < 0) - goto out; - - nents = ret; - sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); - if (!sg) { - ret = -ENOMEM; + if (ret == -EOPNOTSUPP) { + need_odp = 1; + } else if (ret <= 0) { goto out; - } - WARN_ON(!nents); - sg_init_table(sg, nents); - - /* Stick all pages into the scatterlist */ - for (i = 0 ; i < nents; i++) - sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + } else { + nents = ret; + sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); + if (!sg) { + ret = -ENOMEM; + goto out; + } + WARN_ON(!nents); + sg_init_table(sg, nents); - rdsdebug("RDS: trans_private nents is %u\n", nents); + /* Stick all pages into the scatterlist */ + for (i = 0 ; i < nents; i++) + sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + rdsdebug("RDS: trans_private nents is %u\n", nents); + } /* Obtain a transport specific MR. If this succeeds, the * s/g list is now owned by the MR. * Note that dma_map() implies that pending writes are * flushed to RAM, so no dma_sync is needed here. */ - trans_private = rs->rs_transport->get_mr(sg, nents, rs, - &mr->r_key, - cp ? cp->cp_conn : NULL); + trans_private = rs->rs_transport->get_mr( + sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL, + args->vec.addr, args->vec.bytes, + need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED); if (IS_ERR(trans_private)) { - for (i = 0 ; i < nents; i++) - put_page(sg_page(&sg[i])); - kfree(sg); + /* In ODP case, we don't GUP pages, so don't need + * to release anything. + */ + if (!need_odp) { + for (i = 0 ; i < nents; i++) + put_page(sg_page(&sg[i])); + kfree(sg); + } ret = PTR_ERR(trans_private); goto out; } @@ -291,7 +317,11 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, * map page aligned regions. So we keep the offset, and build * a 64bit cookie containing <R_Key, offset> and pass that * around. */ - cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK); + if (need_odp) + cookie = rds_rdma_make_cookie(mr->r_key, 0); + else + cookie = rds_rdma_make_cookie(mr->r_key, + args->vec.addr & ~PAGE_MASK); if (cookie_ret) *cookie_ret = cookie; @@ -456,22 +486,26 @@ void rds_rdma_free_op(struct rm_rdma_op *ro) { unsigned int i; - for (i = 0; i < ro->op_nents; i++) { - struct page *page = sg_page(&ro->op_sg[i]); - - /* Mark page dirty if it was possibly modified, which - * is the case for a RDMA_READ which copies from remote - * to local memory */ - if (!ro->op_write) { - WARN_ON(!page->mapping && irqs_disabled()); - set_page_dirty(page); + if (ro->op_odp_mr) { + rds_mr_put(ro->op_odp_mr); + } else { + for (i = 0; i < ro->op_nents; i++) { + struct page *page = sg_page(&ro->op_sg[i]); + + /* Mark page dirty if it was possibly modified, which + * is the case for a RDMA_READ which copies from remote + * to local memory + */ + if (!ro->op_write) + set_page_dirty(page); + put_page(page); } - put_page(page); } kfree(ro->op_notifier); ro->op_notifier = NULL; ro->op_active = 0; + ro->op_odp_mr = NULL; } void rds_atomic_free_op(struct rm_atomic_op *ao) @@ -581,6 +615,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct rds_iovec *iovs; unsigned int i, j; int ret = 0; + bool odp_supported = true; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || rm->rdma.op_active) @@ -602,6 +637,9 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, ret = -EINVAL; goto out_ret; } + /* odp-mr is not supported for multiple requests within one message */ + if (args->nr_local != 1) + odp_supported = false; iovs = vec->iov; @@ -623,6 +661,8 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, op->op_silent = !!(args->flags & RDS_RDMA_SILENT); op->op_active = 1; op->op_recverr = rs->rs_recverr; + op->op_odp_mr = NULL; + WARN_ON(!nr_pages); op->op_sg = rds_message_alloc_sgs(rm, nr_pages, &ret); if (!op->op_sg) @@ -672,10 +712,44 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, * If it's a READ operation, we need to pin the pages for writing. */ ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); - if (ret < 0) + if ((!odp_supported && ret <= 0) || + (odp_supported && ret <= 0 && ret != -EOPNOTSUPP)) goto out_pages; - else - ret = 0; + + if (ret == -EOPNOTSUPP) { + struct rds_mr *local_odp_mr; + + if (!rs->rs_transport->get_mr) { + ret = -EOPNOTSUPP; + goto out_pages; + } + local_odp_mr = + kzalloc(sizeof(*local_odp_mr), GFP_KERNEL); + if (!local_odp_mr) { + ret = -ENOMEM; + goto out_pages; + } + RB_CLEAR_NODE(&local_odp_mr->r_rb_node); + refcount_set(&local_odp_mr->r_refcount, 1); + local_odp_mr->r_trans = rs->rs_transport; + local_odp_mr->r_sock = rs; + local_odp_mr->r_trans_private = + rs->rs_transport->get_mr( + NULL, 0, rs, &local_odp_mr->r_key, NULL, + iov->addr, iov->bytes, ODP_VIRTUAL); + if (IS_ERR(local_odp_mr->r_trans_private)) { + ret = IS_ERR(local_odp_mr->r_trans_private); + rdsdebug("get_mr ret %d %p\"", ret, + local_odp_mr->r_trans_private); + kfree(local_odp_mr); + ret = -EOPNOTSUPP; + goto out_pages; + } + rdsdebug("Need odp; local_odp_mr %p trans_private %p\n", + local_odp_mr, local_odp_mr->r_trans_private); + op->op_odp_mr = local_odp_mr; + op->op_odp_addr = iov->addr; + } rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", nr_bytes, nr, iov->bytes, iov->addr); @@ -691,6 +765,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), offset); + sg_dma_len(sg) = sg->length; rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", sg->offset, sg->length, iov->addr, iov->bytes); @@ -709,6 +784,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, goto out_pages; } op->op_bytes = nr_bytes; + ret = 0; out_pages: kfree(pages); @@ -755,7 +831,8 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (mr) { - mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); + mr->r_trans->sync_mr(mr->r_trans_private, + DMA_TO_DEVICE); rm->rdma.op_rdma_mr = mr; } return err; diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 9986d6065c4d..5f741e51b4ba 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -43,6 +43,9 @@ static struct rdma_cm_id *rds_rdma_listen_id; static struct rdma_cm_id *rds6_rdma_listen_id; #endif +/* Per IB specification 7.7.3, service level is a 4-bit field. */ +#define TOS_TO_SL(tos) ((tos) & 0xF) + static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, struct rdma_cm_event *event, bool isv6) @@ -97,10 +100,13 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, struct rds_ib_connection *ibic; ibic = conn->c_transport_data; - if (ibic && ibic->i_cm_id == cm_id) + if (ibic && ibic->i_cm_id == cm_id) { + cm_id->route.path_rec[0].sl = + TOS_TO_SL(conn->c_tos); ret = trans->cm_initiate_connect(cm_id, isv6); - else + } else { rds_conn_drop(conn); + } } break; diff --git a/net/rds/rds.h b/net/rds/rds.h index f0066d168499..e4a603523083 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -40,7 +40,6 @@ #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 #endif - #ifdef RDS_DEBUG #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) #else @@ -271,6 +270,12 @@ struct rds_ext_header_rdma_dest { #define RDS_MSG_RX_END 2 #define RDS_MSG_RX_CMSG 3 +/* The following values are whitelisted for usercopy */ +struct rds_inc_usercopy { + rds_rdma_cookie_t rdma_cookie; + ktime_t rx_tstamp; +}; + struct rds_incoming { refcount_t i_refcount; struct list_head i_item; @@ -280,8 +285,7 @@ struct rds_incoming { unsigned long i_rx_jiffies; struct in6_addr i_saddr; - rds_rdma_cookie_t i_rdma_cookie; - ktime_t i_rx_tstamp; + struct rds_inc_usercopy i_usercopy; u64 i_rx_lat_trace[RDS_RX_MAX_TRACES]; }; @@ -473,6 +477,9 @@ struct rds_message { struct rds_notifier *op_notifier; struct rds_mr *op_rdma_mr; + + u64 op_odp_addr; + struct rds_mr *op_odp_mr; } rdma; struct rm_data_op { unsigned int op_active:1; @@ -568,7 +575,8 @@ struct rds_transport { void (*exit)(void); void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, struct rds_sock *rs, u32 *key_ret, - struct rds_connection *conn); + struct rds_connection *conn, + u64 start, u64 length, int need_odp); void (*sync_mr)(void *trans_private, int direction); void (*free_mr)(void *trans_private, int invalidate); void (*flush_mrs)(void); @@ -717,7 +725,7 @@ struct rds_statistics { uint64_t s_cong_send_blocked; uint64_t s_recv_bytes_added_to_socket; uint64_t s_recv_bytes_removed_from_socket; - + uint64_t s_send_stuck_rm; }; /* af_rds.c */ @@ -951,6 +959,12 @@ static inline bool rds_destroy_pending(struct rds_connection *conn) (conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn)); } +enum { + ODP_NOT_NEEDED, + ODP_ZEROBASED, + ODP_VIRTUAL +}; + /* stats.c */ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); #define rds_stats_inc_which(which, member) do { \ diff --git a/net/rds/recv.c b/net/rds/recv.c index 853de4876088..c8404971d5ab 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -47,8 +47,8 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, INIT_LIST_HEAD(&inc->i_item); inc->i_conn = conn; inc->i_saddr = *saddr; - inc->i_rdma_cookie = 0; - inc->i_rx_tstamp = ktime_set(0, 0); + inc->i_usercopy.rdma_cookie = 0; + inc->i_usercopy.rx_tstamp = ktime_set(0, 0); memset(inc->i_rx_lat_trace, 0, sizeof(inc->i_rx_lat_trace)); } @@ -62,8 +62,8 @@ void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp, inc->i_conn = cp->cp_conn; inc->i_conn_path = cp; inc->i_saddr = *saddr; - inc->i_rdma_cookie = 0; - inc->i_rx_tstamp = ktime_set(0, 0); + inc->i_usercopy.rdma_cookie = 0; + inc->i_usercopy.rx_tstamp = ktime_set(0, 0); } EXPORT_SYMBOL_GPL(rds_inc_path_init); @@ -186,7 +186,7 @@ static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock case RDS_EXTHDR_RDMA_DEST: /* We ignore the size for now. We could stash it * somewhere and use it for error checking. */ - inc->i_rdma_cookie = rds_rdma_make_cookie( + inc->i_usercopy.rdma_cookie = rds_rdma_make_cookie( be32_to_cpu(buffer.rdma_dest.h_rdma_rkey), be32_to_cpu(buffer.rdma_dest.h_rdma_offset)); @@ -380,7 +380,7 @@ void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); if (sock_flag(sk, SOCK_RCVTSTAMP)) - inc->i_rx_tstamp = ktime_get_real(); + inc->i_usercopy.rx_tstamp = ktime_get_real(); rds_inc_addref(inc); inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock(); list_add_tail(&inc->i_item, &rs->rs_recv_queue); @@ -540,16 +540,18 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, { int ret = 0; - if (inc->i_rdma_cookie) { + if (inc->i_usercopy.rdma_cookie) { ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, - sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie); + sizeof(inc->i_usercopy.rdma_cookie), + &inc->i_usercopy.rdma_cookie); if (ret) goto out; } - if ((inc->i_rx_tstamp != 0) && + if ((inc->i_usercopy.rx_tstamp != 0) && sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) { - struct __kernel_old_timeval tv = ns_to_kernel_old_timeval(inc->i_rx_tstamp); + struct __kernel_old_timeval tv = + ns_to_kernel_old_timeval(inc->i_usercopy.rx_tstamp); if (!sock_flag(rds_rs_to_sk(rs), SOCK_TSTAMP_NEW)) { ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, @@ -811,6 +813,7 @@ void rds6_inc_info_copy(struct rds_incoming *inc, minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence); minfo6.len = be32_to_cpu(inc->i_hdr.h_len); + minfo6.tos = inc->i_conn->c_tos; if (flip) { minfo6.laddr = *daddr; @@ -824,6 +827,8 @@ void rds6_inc_info_copy(struct rds_incoming *inc, minfo6.fport = inc->i_hdr.h_dport; } + minfo6.flags = 0; + rds_info_copy(iter, &minfo6, sizeof(minfo6)); } #endif diff --git a/net/rds/send.c b/net/rds/send.c index 031b1e97a466..82dcd8b84fe7 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -145,6 +145,7 @@ int rds_send_xmit(struct rds_conn_path *cp) LIST_HEAD(to_be_dropped); int batch_count; unsigned long send_gen = 0; + int same_rm = 0; restart: batch_count = 0; @@ -200,6 +201,17 @@ restart: rm = cp->cp_xmit_rm; + if (!rm) { + same_rm = 0; + } else { + same_rm++; + if (same_rm >= 4096) { + rds_stats_inc(s_send_stuck_rm); + ret = -EAGAIN; + break; + } + } + /* * If between sending messages, we can send a pending congestion * map update. @@ -1132,7 +1144,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) case AF_INET: if (usin->sin_addr.s_addr == htonl(INADDR_ANY) || usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || - IN_MULTICAST(ntohl(usin->sin_addr.s_addr))) { + ipv4_is_multicast(usin->sin_addr.s_addr)) { ret = -EINVAL; goto out; } @@ -1163,7 +1175,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) addr4 = sin6->sin6_addr.s6_addr32[3]; if (addr4 == htonl(INADDR_ANY) || addr4 == htonl(INADDR_BROADCAST) || - IN_MULTICAST(ntohl(addr4))) { + ipv4_is_multicast(addr4)) { ret = -EINVAL; goto out; } diff --git a/net/rds/stats.c b/net/rds/stats.c index 73be187d389e..9e87da43c004 100644 --- a/net/rds/stats.c +++ b/net/rds/stats.c @@ -76,6 +76,9 @@ static const char *const rds_stat_names[] = { "cong_update_received", "cong_send_error", "cong_send_blocked", + "recv_bytes_added_to_sock", + "recv_bytes_freed_fromsock", + "send_stuck_rm", }; void rds_stats_info_copy(struct rds_info_iterator *iter, diff --git a/net/rfkill/core.c b/net/rfkill/core.c index f9b08a6d8dbe..971c73c7d34c 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -1002,10 +1002,13 @@ static void rfkill_sync_work(struct work_struct *work) int __must_check rfkill_register(struct rfkill *rfkill) { static unsigned long rfkill_no; - struct device *dev = &rfkill->dev; + struct device *dev; int error; - BUG_ON(!rfkill); + if (!rfkill) + return -EINVAL; + + dev = &rfkill->dev; mutex_lock(&rfkill_global_mutex); @@ -1311,15 +1314,17 @@ static const struct file_operations rfkill_fops = { .release = rfkill_fop_release, #ifdef CONFIG_RFKILL_INPUT .unlocked_ioctl = rfkill_fop_ioctl, - .compat_ioctl = rfkill_fop_ioctl, + .compat_ioctl = compat_ptr_ioctl, #endif .llseek = no_llseek, }; +#define RFKILL_NAME "rfkill" + static struct miscdevice rfkill_miscdev = { - .name = "rfkill", .fops = &rfkill_fops, - .minor = MISC_DYNAMIC_MINOR, + .name = RFKILL_NAME, + .minor = RFKILL_MINOR, }; static int __init rfkill_init(void) @@ -1371,3 +1376,6 @@ static void __exit rfkill_exit(void) class_unregister(&rfkill_class); } module_exit(rfkill_exit); + +MODULE_ALIAS_MISCDEV(RFKILL_MINOR); +MODULE_ALIAS("devname:" RFKILL_NAME); diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index f0e9ccf472a9..1e8eeb044b07 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -65,28 +65,6 @@ static const struct proto_ops rose_proto_ops; ax25_address rose_callsign; /* - * ROSE network devices are virtual network devices encapsulating ROSE - * frames into AX.25 which will be sent through an AX.25 device, so form a - * special "super class" of normal net devices; split their locks off into a - * separate class since they always nest. - */ -static struct lock_class_key rose_netdev_xmit_lock_key; -static struct lock_class_key rose_netdev_addr_lock_key; - -static void rose_set_lockdep_one(struct net_device *dev, - struct netdev_queue *txq, - void *_unused) -{ - lockdep_set_class(&txq->_xmit_lock, &rose_netdev_xmit_lock_key); -} - -static void rose_set_lockdep_key(struct net_device *dev) -{ - lockdep_set_class(&dev->addr_list_lock, &rose_netdev_addr_lock_key); - netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL); -} - -/* * Convert a ROSE address into text. */ char *rose2asc(char *buf, const rose_address *addr) @@ -928,7 +906,7 @@ static int rose_accept(struct socket *sock, struct socket *newsock, int flags, /* Now attach up the new socket */ skb->sk = NULL; kfree_skb(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); out_release: release_sock(sk); @@ -1033,7 +1011,7 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros make_rose->va = 0; make_rose->vr = 0; make_rose->vl = 0; - sk->sk_ack_backlog++; + sk_acceptq_added(sk); rose_insert_socket(make); @@ -1497,7 +1475,7 @@ static int __init rose_proto_init(void) int rc; if (rose_ndevs > 0x7FFFFFFF/sizeof(struct net_device *)) { - printk(KERN_ERR "ROSE: rose_proto_init - rose_ndevs parameter to large\n"); + printk(KERN_ERR "ROSE: rose_proto_init - rose_ndevs parameter too large\n"); rc = -EINVAL; goto out; } @@ -1533,7 +1511,6 @@ static int __init rose_proto_init(void) free_netdev(dev); goto fail; } - rose_set_lockdep_key(dev); dev_rose[i] = dev; } diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index c53307623236..5277631fa14c 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -696,7 +696,6 @@ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, for (i = 0; i < node->count; i++) { if (!rose_ftimer_running(node->neighbour[i])) { res = node->neighbour[i]; - failed = 0; goto out; } failed = 1; diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index 05610c3a3d25..57ebb29c26ad 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -49,7 +49,7 @@ config RXKAD depends on AF_RXRPC select CRYPTO select CRYPTO_MANAGER - select CRYPTO_BLKCIPHER + select CRYPTO_SKCIPHER select CRYPTO_PCBC select CRYPTO_FCRYPT help diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 0dbbfd1b6487..fe42f986cd94 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -194,6 +194,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) service_in_use: write_unlock(&local->services_lock); rxrpc_unuse_local(local); + rxrpc_put_local(local); ret = -EADDRINUSE; error_unlock: release_sock(&rx->sk); @@ -862,7 +863,6 @@ static void rxrpc_sock_destructor(struct sock *sk) static int rxrpc_release_sock(struct sock *sk) { struct rxrpc_sock *rx = rxrpc_sk(sk); - struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); _enter("%p{%d,%d}", sk, sk->sk_state, refcount_read(&sk->sk_refcnt)); @@ -898,10 +898,9 @@ static int rxrpc_release_sock(struct sock *sk) rxrpc_release_calls_on_socket(rx); flush_workqueue(rxrpc_workqueue); rxrpc_purge_queue(&sk->sk_receive_queue); - rxrpc_queue_work(&rxnet->service_conn_reaper); - rxrpc_queue_work(&rxnet->client_conn_reaper); rxrpc_unuse_local(rx->local); + rxrpc_put_local(rx->local); rx->local = NULL; key_put(rx->key); rx->key = NULL; @@ -975,7 +974,7 @@ static int __init af_rxrpc_init(void) int ret = -1; unsigned int tmp; - BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb)); + BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > sizeof_field(struct sk_buff, cb)); get_random_bytes(&tmp, sizeof(tmp)); tmp &= 0x3fffffff; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 145335611af6..7d730c438404 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -185,11 +185,17 @@ struct rxrpc_host_header { * - max 48 bytes (struct sk_buff::cb) */ struct rxrpc_skb_priv { - union { - u8 nr_jumbo; /* Number of jumbo subpackets */ - }; + atomic_t nr_ring_pins; /* Number of rxtx ring pins */ + u8 nr_subpackets; /* Number of subpackets */ + u8 rx_flags; /* Received packet flags */ +#define RXRPC_SKB_INCL_LAST 0x01 /* - Includes last packet */ +#define RXRPC_SKB_TX_BUFFER 0x02 /* - Is transmit buffer */ union { int remain; /* amount of space remaining for next write */ + + /* List of requested ACKs on subpackets */ + unsigned long rx_req_ack[(RXRPC_MAX_NR_JUMBO + BITS_PER_LONG - 1) / + BITS_PER_LONG]; }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ @@ -203,6 +209,7 @@ struct rxrpc_skb_priv { struct rxrpc_security { const char *name; /* name of this service */ u8 security_index; /* security type provided */ + u32 no_key_abort; /* Abort code indicating no key */ /* Initialise a security service */ int (*init)(void); @@ -226,6 +233,9 @@ struct rxrpc_security { int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, unsigned int, unsigned int, rxrpc_seq_t, u16); + /* Free crypto request on a call */ + void (*free_call_crypto)(struct rxrpc_call *); + /* Locate the data in a received packet that has been verified. */ void (*locate_data)(struct rxrpc_call *, struct sk_buff *, unsigned int *, unsigned int *); @@ -480,6 +490,7 @@ enum rxrpc_call_flag { RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */ RXRPC_CALL_RX_UNDERRUN, /* Got data underrun */ RXRPC_CALL_IS_INTR, /* The call is interruptible */ + RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */ }; /* @@ -547,6 +558,7 @@ struct rxrpc_call { struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_sock __rcu *socket; /* socket responsible */ struct rxrpc_net *rxnet; /* Network namespace to which call belongs */ + const struct rxrpc_security *security; /* applied security module */ struct mutex user_mutex; /* User access mutex */ unsigned long ack_at; /* When deferred ACK needs to happen */ unsigned long ack_lost_at; /* When ACK is figured as lost */ @@ -558,6 +570,7 @@ struct rxrpc_call { unsigned long expect_term_by; /* When we expect call termination by */ u32 next_rx_timo; /* Timeout for next Rx packet (jif) */ u32 next_req_timo; /* Timeout for next Rx request packet (jif) */ + struct skcipher_request *cipher_req; /* Packet cipher request buffer */ struct timer_list timer; /* Combined event timer */ struct work_struct processor; /* Event processor */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ @@ -590,6 +603,7 @@ struct rxrpc_call { int debug_id; /* debug ID for printks */ unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ unsigned short rx_pkt_len; /* Current recvmsg packet len */ + bool rx_pkt_last; /* Current recvmsg packet is last */ /* Rx/Tx circular buffer, depending on phase. * @@ -613,8 +627,7 @@ struct rxrpc_call { #define RXRPC_TX_ANNO_LAST 0x04 #define RXRPC_TX_ANNO_RESENT 0x08 -#define RXRPC_RX_ANNO_JUMBO 0x3f /* Jumbo subpacket number + 1 if not zero */ -#define RXRPC_RX_ANNO_JLAST 0x40 /* Set if last element of a jumbo packet */ +#define RXRPC_RX_ANNO_SUBPACKET 0x3f /* Subpacket number in jumbogram */ #define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */ rxrpc_seq_t tx_hard_ack; /* Dead slot in buffer; the first transmitted but * not hard-ACK'd packet follows this. @@ -905,6 +918,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *); void rxrpc_put_client_conn(struct rxrpc_connection *); void rxrpc_discard_expired_client_conns(struct work_struct *); void rxrpc_destroy_all_client_connections(struct rxrpc_net *); +void rxrpc_clean_up_local_conns(struct rxrpc_local *); /* * conn_event.c @@ -965,8 +979,9 @@ static inline void rxrpc_reduce_conn_timer(struct rxrpc_connection *conn, struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, struct sk_buff *); struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *, gfp_t); -void rxrpc_new_incoming_connection(struct rxrpc_sock *, - struct rxrpc_connection *, struct sk_buff *); +void rxrpc_new_incoming_connection(struct rxrpc_sock *, struct rxrpc_connection *, + const struct rxrpc_security *, struct key *, + struct sk_buff *); void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* @@ -1007,6 +1022,16 @@ void rxrpc_unuse_local(struct rxrpc_local *); void rxrpc_queue_local(struct rxrpc_local *); void rxrpc_destroy_all_locals(struct rxrpc_net *); +static inline bool __rxrpc_unuse_local(struct rxrpc_local *local) +{ + return atomic_dec_return(&local->active_users) == 0; +} + +static inline bool __rxrpc_use_local(struct rxrpc_local *local) +{ + return atomic_fetch_add_unless(&local->active_users, 1, 0) != 0; +} + /* * misc.c */ @@ -1091,7 +1116,9 @@ extern const struct rxrpc_security rxkad; int __init rxrpc_init_security(void); void rxrpc_exit_security(void); int rxrpc_init_client_conn_security(struct rxrpc_connection *); -int rxrpc_init_server_conn_security(struct rxrpc_connection *); +bool rxrpc_look_up_server_security(struct rxrpc_local *, struct rxrpc_sock *, + const struct rxrpc_security **, struct key **, + struct sk_buff *); /* * sendmsg.c @@ -1105,6 +1132,7 @@ void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); void rxrpc_packet_destructor(struct sk_buff *); void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_eaten_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_purge_queue(struct sk_buff_head *); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 00c095d74145..70e44abf106c 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -84,7 +84,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, smp_store_release(&b->conn_backlog_head, (head + 1) & (size - 1)); - trace_rxrpc_conn(conn, rxrpc_conn_new_service, + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_service, atomic_read(&conn->usage), here); } @@ -97,7 +97,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, call->flags |= (1 << RXRPC_CALL_IS_SERVICE); call->state = RXRPC_CALL_SERVER_PREALLOC; - trace_rxrpc_call(call, rxrpc_call_new_service, + trace_rxrpc_call(call->debug_id, rxrpc_call_new_service, atomic_read(&call->usage), here, (const void *)user_call_ID); @@ -240,6 +240,22 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) } /* + * Ping the other end to fill our RTT cache and to retrieve the rwind + * and MTU parameters. + */ +static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + ktime_t now = skb->tstamp; + + if (call->peer->rtt_usage < 3 || + ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) + rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial, + true, true, + rxrpc_propose_ack_ping_for_params); +} + +/* * Allocate a new incoming call from the prealloc pool, along with a connection * and a peer as necessary. */ @@ -247,6 +263,8 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_local *local, struct rxrpc_peer *peer, struct rxrpc_connection *conn, + const struct rxrpc_security *sec, + struct key *key, struct sk_buff *skb) { struct rxrpc_backlog *b = rx->backlog; @@ -294,7 +312,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, conn->params.local = rxrpc_get_local(local); conn->params.peer = peer; rxrpc_see_connection(conn); - rxrpc_new_incoming_connection(rx, conn, skb); + rxrpc_new_incoming_connection(rx, conn, sec, key, skb); } else { rxrpc_get_connection(conn); } @@ -307,6 +325,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, rxrpc_see_call(call); call->conn = conn; + call->security = conn->security; call->peer = rxrpc_get_peer(conn->params.peer); call->cong_cwnd = call->peer->cong_cwnd; return call; @@ -332,9 +351,11 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + const struct rxrpc_security *sec = NULL; struct rxrpc_connection *conn; struct rxrpc_peer *peer = NULL; - struct rxrpc_call *call; + struct rxrpc_call *call = NULL; + struct key *key = NULL; _enter(""); @@ -345,9 +366,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; skb->priority = RX_INVALID_OPERATION; - _leave(" = NULL [close]"); - call = NULL; - goto out; + goto no_call; } /* The peer, connection and call may all have sprung into existence due @@ -357,29 +376,19 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, */ conn = rxrpc_find_connection_rcu(local, skb, &peer); - call = rxrpc_alloc_incoming_call(rx, local, peer, conn, skb); + if (!conn && !rxrpc_look_up_server_security(local, rx, &sec, &key, skb)) + goto no_call; + + call = rxrpc_alloc_incoming_call(rx, local, peer, conn, sec, key, skb); + key_put(key); if (!call) { skb->mark = RXRPC_SKB_MARK_REJECT_BUSY; - _leave(" = NULL [busy]"); - call = NULL; - goto out; + goto no_call; } trace_rxrpc_receive(call, rxrpc_receive_incoming, sp->hdr.serial, sp->hdr.seq); - /* Lock the call to prevent rxrpc_kernel_send/recv_data() and - * sendmsg()/recvmsg() inconveniently stealing the mutex once the - * notification is generated. - * - * The BUG should never happen because the kernel should be well - * behaved enough not to access the call before the first notification - * event and userspace is prevented from doing so until the state is - * appropriate. - */ - if (!mutex_trylock(&call->user_mutex)) - BUG(); - /* Make the call live. */ rxrpc_incoming_call(rx, call, skb); conn = call->conn; @@ -420,6 +429,9 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, BUG(); } spin_unlock(&conn->state_lock); + spin_unlock(&rx->incoming_lock); + + rxrpc_send_ping(call, skb); if (call->state == RXRPC_CALL_SERVER_ACCEPTING) rxrpc_notify_socket(call); @@ -432,9 +444,12 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, rxrpc_put_call(call, rxrpc_call_put); _leave(" = %p{%d}", call, call->debug_id); -out: - spin_unlock(&rx->incoming_lock); return call; + +no_call: + spin_unlock(&rx->incoming_lock); + _leave(" = NULL [%u]", skb->mark); + return NULL; } /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index c767679bfa5d..cedbbb3a7c2e 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -199,7 +199,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) continue; skb = call->rxtx_buffer[ix]; - rxrpc_see_skb(skb, rxrpc_skb_tx_seen); + rxrpc_see_skb(skb, rxrpc_skb_seen); if (anno_type == RXRPC_TX_ANNO_UNACK) { if (ktime_after(skb->tstamp, max_age)) { @@ -255,18 +255,18 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) continue; skb = call->rxtx_buffer[ix]; - rxrpc_get_skb(skb, rxrpc_skb_tx_got); + rxrpc_get_skb(skb, rxrpc_skb_got); spin_unlock_bh(&call->lock); if (rxrpc_send_data_packet(call, skb, true) < 0) { - rxrpc_free_skb(skb, rxrpc_skb_tx_freed); + rxrpc_free_skb(skb, rxrpc_skb_freed); return; } if (rxrpc_is_client_call(call)) rxrpc_expose_client_call(call); - rxrpc_free_skb(skb, rxrpc_skb_tx_freed); + rxrpc_free_skb(skb, rxrpc_skb_freed); spin_lock_bh(&call->lock); /* We need to clear the retransmit state, but there are two diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 217b12be9e08..c9f34b0a11df 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -240,7 +240,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, if (p->intr) __set_bit(RXRPC_CALL_IS_INTR, &call->flags); call->tx_total_len = p->tx_total_len; - trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage), + trace_rxrpc_call(call->debug_id, rxrpc_call_new_client, + atomic_read(&call->usage), here, (const void *)p->user_call_ID); /* We need to protect a partially set up call against the user as we @@ -290,8 +291,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, if (ret < 0) goto error; - trace_rxrpc_call(call, rxrpc_call_connected, atomic_read(&call->usage), - here, NULL); + trace_rxrpc_call(call->debug_id, rxrpc_call_connected, + atomic_read(&call->usage), here, NULL); rxrpc_start_call_timer(call); @@ -313,8 +314,8 @@ error_dup_user_ID: error: __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, RX_CALL_DEAD, ret); - trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage), - here, ERR_PTR(ret)); + trace_rxrpc_call(call->debug_id, rxrpc_call_error, + atomic_read(&call->usage), here, ERR_PTR(ret)); rxrpc_release_call(rx, call); mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put); @@ -376,7 +377,8 @@ bool rxrpc_queue_call(struct rxrpc_call *call) if (n == 0) return false; if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call, rxrpc_call_queued, n + 1, here, NULL); + trace_rxrpc_call(call->debug_id, rxrpc_call_queued, n + 1, + here, NULL); else rxrpc_put_call(call, rxrpc_call_put_noqueue); return true; @@ -391,7 +393,8 @@ bool __rxrpc_queue_call(struct rxrpc_call *call) int n = atomic_read(&call->usage); ASSERTCMP(n, >=, 1); if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call, rxrpc_call_queued_ref, n, here, NULL); + trace_rxrpc_call(call->debug_id, rxrpc_call_queued_ref, n, + here, NULL); else rxrpc_put_call(call, rxrpc_call_put_noqueue); return true; @@ -406,7 +409,8 @@ void rxrpc_see_call(struct rxrpc_call *call) if (call) { int n = atomic_read(&call->usage); - trace_rxrpc_call(call, rxrpc_call_seen, n, here, NULL); + trace_rxrpc_call(call->debug_id, rxrpc_call_seen, n, + here, NULL); } } @@ -418,7 +422,20 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) const void *here = __builtin_return_address(0); int n = atomic_inc_return(&call->usage); - trace_rxrpc_call(call, op, n, here, NULL); + trace_rxrpc_call(call->debug_id, op, n, here, NULL); +} + +/* + * Clean up the RxTx skb ring. + */ +static void rxrpc_cleanup_ring(struct rxrpc_call *call) +{ + int i; + + for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) { + rxrpc_free_skb(call->rxtx_buffer[i], rxrpc_skb_cleaned); + call->rxtx_buffer[i] = NULL; + } } /* @@ -429,11 +446,11 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) const void *here = __builtin_return_address(0); struct rxrpc_connection *conn = call->conn; bool put = false; - int i; _enter("{%d,%d}", call->debug_id, atomic_read(&call->usage)); - trace_rxrpc_call(call, rxrpc_call_release, atomic_read(&call->usage), + trace_rxrpc_call(call->debug_id, rxrpc_call_release, + atomic_read(&call->usage), here, (const void *)call->flags); ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); @@ -476,16 +493,12 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); - if (conn) + if (conn && !test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) rxrpc_disconnect_call(call); + if (call->security) + call->security->free_call_crypto(call); - for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) { - rxrpc_free_skb(call->rxtx_buffer[i], - (call->tx_phase ? rxrpc_skb_tx_cleaned : - rxrpc_skb_rx_cleaned)); - call->rxtx_buffer[i] = NULL; - } - + rxrpc_cleanup_ring(call); _leave(""); } @@ -526,12 +539,13 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { struct rxrpc_net *rxnet = call->rxnet; const void *here = __builtin_return_address(0); + unsigned int debug_id = call->debug_id; int n; ASSERT(call != NULL); n = atomic_dec_return(&call->usage); - trace_rxrpc_call(call, op, n, here, NULL); + trace_rxrpc_call(debug_id, op, n, here, NULL); ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); @@ -548,13 +562,14 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) } /* - * Final call destruction under RCU. + * Final call destruction - but must be done in process context. */ -static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) +static void rxrpc_destroy_call(struct work_struct *work) { - struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); + struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor); struct rxrpc_net *rxnet = call->rxnet; + rxrpc_put_connection(call->conn); rxrpc_put_peer(call->peer); kfree(call->rxtx_buffer); kfree(call->rxtx_annotations); @@ -564,12 +579,26 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) } /* + * Final call destruction under RCU. + */ +static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) +{ + struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); + + if (in_softirq()) { + INIT_WORK(&call->processor, rxrpc_destroy_call); + if (!rxrpc_queue_work(&call->processor)) + BUG(); + } else { + rxrpc_destroy_call(&call->processor); + } +} + +/* * clean up a call */ void rxrpc_cleanup_call(struct rxrpc_call *call) { - int i; - _net("DESTROY CALL %d", call->debug_id); memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); @@ -578,15 +607,9 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); - ASSERTCMP(call->conn, ==, NULL); - - /* Clean up the Rx/Tx buffer */ - for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) - rxrpc_free_skb(call->rxtx_buffer[i], - (call->tx_phase ? rxrpc_skb_tx_cleaned : - rxrpc_skb_rx_cleaned)); - rxrpc_free_skb(call->tx_pending, rxrpc_skb_tx_cleaned); + rxrpc_cleanup_ring(call); + rxrpc_free_skb(call->tx_pending, rxrpc_skb_cleaned); call_rcu(&call->rcu, rxrpc_rcu_destroy_call); } diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index aea82f909c60..ea7d4c21f889 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -212,7 +212,8 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) rxrpc_get_local(conn->params.local); key_get(conn->params.key); - trace_rxrpc_conn(conn, rxrpc_conn_new_client, atomic_read(&conn->usage), + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_client, + atomic_read(&conn->usage), __builtin_return_address(0)); trace_rxrpc_client(conn, -1, rxrpc_client_alloc); _leave(" = %p", conn); @@ -352,6 +353,7 @@ static int rxrpc_get_client_conn(struct rxrpc_sock *rx, if (cp->exclusive) { call->conn = candidate; + call->security = candidate->security; call->security_ix = candidate->security_ix; call->service_id = candidate->service_id; _leave(" = 0 [exclusive %d]", candidate->debug_id); @@ -403,6 +405,7 @@ static int rxrpc_get_client_conn(struct rxrpc_sock *rx, candidate_published: set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags); call->conn = candidate; + call->security = candidate->security; call->security_ix = candidate->security_ix; call->service_id = candidate->service_id; spin_unlock(&local->client_conns_lock); @@ -425,6 +428,7 @@ found_extant_conn: spin_lock(&conn->channel_lock); call->conn = conn; + call->security = conn->security; call->security_ix = conn->security_ix; call->service_id = conn->service_id; list_add_tail(&call->chan_wait_link, &conn->waiting_calls); @@ -781,6 +785,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) u32 cid; spin_lock(&conn->channel_lock); + set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); cid = call->cid; if (cid) { @@ -788,7 +793,6 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) chan = &conn->channels[channel]; } trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect); - call->conn = NULL; /* Calls that have never actually been assigned a channel can simply be * discarded. If the conn didn't get used either, it will follow @@ -904,7 +908,6 @@ out: spin_unlock(&rxnet->client_conn_cache_lock); out_2: spin_unlock(&conn->channel_lock); - rxrpc_put_connection(conn); _leave(""); return; @@ -985,11 +988,12 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) void rxrpc_put_client_conn(struct rxrpc_connection *conn) { const void *here = __builtin_return_address(0); + unsigned int debug_id = conn->debug_id; int n; do { n = atomic_dec_return(&conn->usage); - trace_rxrpc_conn(conn, rxrpc_conn_put_client, n, here); + trace_rxrpc_conn(debug_id, rxrpc_conn_put_client, n, here); if (n > 0) return; ASSERTCMP(n, >=, 0); @@ -1162,3 +1166,47 @@ void rxrpc_destroy_all_client_connections(struct rxrpc_net *rxnet) _leave(""); } + +/* + * Clean up the client connections on a local endpoint. + */ +void rxrpc_clean_up_local_conns(struct rxrpc_local *local) +{ + struct rxrpc_connection *conn, *tmp; + struct rxrpc_net *rxnet = local->rxnet; + unsigned int nr_active; + LIST_HEAD(graveyard); + + _enter(""); + + spin_lock(&rxnet->client_conn_cache_lock); + nr_active = rxnet->nr_active_client_conns; + + list_for_each_entry_safe(conn, tmp, &rxnet->idle_client_conns, + cache_link) { + if (conn->params.local == local) { + ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_IDLE); + + trace_rxrpc_client(conn, -1, rxrpc_client_discard); + if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags)) + BUG(); + conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; + list_move(&conn->cache_link, &graveyard); + nr_active--; + } + } + + rxnet->nr_active_client_conns = nr_active; + spin_unlock(&rxnet->client_conn_cache_lock); + ASSERTCMP(nr_active, >=, 0); + + while (!list_empty(&graveyard)) { + conn = list_entry(graveyard.next, + struct rxrpc_connection, cache_link); + list_del_init(&conn->cache_link); + + rxrpc_put_connection(conn); + } + + _leave(" [culled]"); +} diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index df6624c140be..06fcff2ebbba 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -376,21 +376,7 @@ static void rxrpc_secure_connection(struct rxrpc_connection *conn) _enter("{%d}", conn->debug_id); ASSERT(conn->security_ix != 0); - - if (!conn->params.key) { - _debug("set up security"); - ret = rxrpc_init_server_conn_security(conn); - switch (ret) { - case 0: - break; - case -ENOENT: - abort_code = RX_CALL_DEAD; - goto abort; - default: - abort_code = RXKADNOAUTH; - goto abort; - } - } + ASSERT(conn->server_key); if (conn->security->issue_challenge(conn) < 0) { abort_code = RX_CALL_DEAD; @@ -452,16 +438,12 @@ again: /* * connection-level event processor */ -void rxrpc_process_connection(struct work_struct *work) +static void rxrpc_do_process_connection(struct rxrpc_connection *conn) { - struct rxrpc_connection *conn = - container_of(work, struct rxrpc_connection, processor); struct sk_buff *skb; u32 abort_code = RX_PROTOCOL_ERROR; int ret; - rxrpc_see_connection(conn); - if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events)) rxrpc_secure_connection(conn); @@ -472,7 +454,7 @@ void rxrpc_process_connection(struct work_struct *work) /* go through the conn-level event packets, releasing the ref on this * connection that each one has when we've finished with it */ while ((skb = skb_dequeue(&conn->rx_queue))) { - rxrpc_see_skb(skb, rxrpc_skb_rx_seen); + rxrpc_see_skb(skb, rxrpc_skb_seen); ret = rxrpc_process_event(conn, skb, &abort_code); switch (ret) { case -EPROTO: @@ -484,23 +466,37 @@ void rxrpc_process_connection(struct work_struct *work) goto requeue_and_leave; case -ECONNABORTED: default: - rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + rxrpc_free_skb(skb, rxrpc_skb_freed); break; } } -out: - rxrpc_put_connection(conn); - _leave(""); return; requeue_and_leave: skb_queue_head(&conn->rx_queue, skb); - goto out; + return; protocol_error: if (rxrpc_abort_connection(conn, ret, abort_code) < 0) goto requeue_and_leave; - rxrpc_free_skb(skb, rxrpc_skb_rx_freed); - goto out; + rxrpc_free_skb(skb, rxrpc_skb_freed); + return; +} + +void rxrpc_process_connection(struct work_struct *work) +{ + struct rxrpc_connection *conn = + container_of(work, struct rxrpc_connection, processor); + + rxrpc_see_connection(conn); + + if (__rxrpc_use_local(conn->params.local)) { + rxrpc_do_process_connection(conn); + rxrpc_unuse_local(conn->params.local); + } + + rxrpc_put_connection(conn); + _leave(""); + return; } diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 434ef392212b..19e141eeed17 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -223,9 +223,8 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) __rxrpc_disconnect_call(conn, call); spin_unlock(&conn->channel_lock); - call->conn = NULL; + set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); conn->idle_timestamp = jiffies; - rxrpc_put_connection(conn); } /* @@ -269,7 +268,7 @@ bool rxrpc_queue_conn(struct rxrpc_connection *conn) if (n == 0) return false; if (rxrpc_queue_work(&conn->processor)) - trace_rxrpc_conn(conn, rxrpc_conn_queued, n + 1, here); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_queued, n + 1, here); else rxrpc_put_connection(conn); return true; @@ -284,7 +283,7 @@ void rxrpc_see_connection(struct rxrpc_connection *conn) if (conn) { int n = atomic_read(&conn->usage); - trace_rxrpc_conn(conn, rxrpc_conn_seen, n, here); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_seen, n, here); } } @@ -296,7 +295,7 @@ void rxrpc_get_connection(struct rxrpc_connection *conn) const void *here = __builtin_return_address(0); int n = atomic_inc_return(&conn->usage); - trace_rxrpc_conn(conn, rxrpc_conn_got, n, here); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_got, n, here); } /* @@ -310,7 +309,7 @@ rxrpc_get_connection_maybe(struct rxrpc_connection *conn) if (conn) { int n = atomic_fetch_add_unless(&conn->usage, 1, 0); if (n > 0) - trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_got, n + 1, here); else conn = NULL; } @@ -333,10 +332,11 @@ static void rxrpc_set_service_reap_timer(struct rxrpc_net *rxnet, void rxrpc_put_service_conn(struct rxrpc_connection *conn) { const void *here = __builtin_return_address(0); + unsigned int debug_id = conn->debug_id; int n; n = atomic_dec_return(&conn->usage); - trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here); + trace_rxrpc_conn(debug_id, rxrpc_conn_put_service, n, here); ASSERTCMP(n, >=, 0); if (n == 1) rxrpc_set_service_reap_timer(conn->params.local->rxnet, @@ -398,7 +398,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work) if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) continue; - if (rxnet->live) { + if (rxnet->live && !conn->params.local->dead) { idle_timestamp = READ_ONCE(conn->idle_timestamp); expire_at = idle_timestamp + rxrpc_connection_expiry * HZ; if (conn->params.local->service_closed) @@ -420,7 +420,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work) */ if (atomic_cmpxchg(&conn->usage, 1, 0) != 1) continue; - trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, NULL); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_reap_service, 0, NULL); if (rxrpc_conn_is_client(conn)) BUG(); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index b30e13f6d95f..21da48e3d2e5 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -134,7 +134,7 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); write_unlock(&rxnet->conn_lock); - trace_rxrpc_conn(conn, rxrpc_conn_new_service, + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_service, atomic_read(&conn->usage), __builtin_return_address(0)); } @@ -148,6 +148,8 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn */ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx, struct rxrpc_connection *conn, + const struct rxrpc_security *sec, + struct key *key, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -160,6 +162,8 @@ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx, conn->service_id = sp->hdr.serviceId; conn->security_ix = sp->hdr.securityIndex; conn->out_clientflag = 0; + conn->security = sec; + conn->server_key = key_get(key); if (conn->security_ix) conn->state = RXRPC_CONN_SERVICE_UNSECURED; else diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index dd47d465d1d3..ef10fbf71b15 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -193,22 +193,6 @@ send_extra_data: } /* - * Ping the other end to fill our RTT cache and to retrieve the rwind - * and MTU parameters. - */ -static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - ktime_t now = skb->tstamp; - - if (call->peer->rtt_usage < 3 || - ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) - rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial, - true, true, - rxrpc_propose_ack_ping_for_params); -} - -/* * Apply a hard ACK by advancing the Tx window. */ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, @@ -233,7 +217,7 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; annotation = call->rxtx_annotations[ix]; - rxrpc_see_skb(skb, rxrpc_skb_tx_rotated); + rxrpc_see_skb(skb, rxrpc_skb_rotated); call->rxtx_buffer[ix] = NULL; call->rxtx_annotations[ix] = 0; skb->next = list; @@ -258,7 +242,7 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, skb = list; list = skb->next; skb_mark_not_on_list(skb); - rxrpc_free_skb(skb, rxrpc_skb_tx_freed); + rxrpc_free_skb(skb, rxrpc_skb_freed); } return rot_last; @@ -347,7 +331,7 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) } /* - * Scan a jumbo packet to validate its structure and to work out how many + * Scan a data packet to validate its structure and to work out how many * subpackets it contains. * * A jumbo packet is a collection of consecutive packets glued together with @@ -358,16 +342,21 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) * the last are RXRPC_JUMBO_DATALEN in size. The last subpacket may be of any * size. */ -static bool rxrpc_validate_jumbo(struct sk_buff *skb) +static bool rxrpc_validate_data(struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); unsigned int offset = sizeof(struct rxrpc_wire_header); unsigned int len = skb->len; - int nr_jumbo = 1; u8 flags = sp->hdr.flags; - do { - nr_jumbo++; + for (;;) { + if (flags & RXRPC_REQUEST_ACK) + __set_bit(sp->nr_subpackets, sp->rx_req_ack); + sp->nr_subpackets++; + + if (!(flags & RXRPC_JUMBO_PACKET)) + break; + if (len - offset < RXRPC_JUMBO_SUBPKTLEN) goto protocol_error; if (flags & RXRPC_LAST_PACKET) @@ -376,9 +365,10 @@ static bool rxrpc_validate_jumbo(struct sk_buff *skb) if (skb_copy_bits(skb, offset, &flags, 1) < 0) goto protocol_error; offset += sizeof(struct rxrpc_jumbo_header); - } while (flags & RXRPC_JUMBO_PACKET); + } - sp->nr_jumbo = nr_jumbo; + if (flags & RXRPC_LAST_PACKET) + sp->rx_flags |= RXRPC_SKB_INCL_LAST; return true; protocol_error: @@ -399,10 +389,10 @@ protocol_error: * (that information is encoded in the ACK packet). */ static void rxrpc_input_dup_data(struct rxrpc_call *call, rxrpc_seq_t seq, - u8 annotation, bool *_jumbo_bad) + bool is_jumbo, bool *_jumbo_bad) { /* Discard normal packets that are duplicates. */ - if (annotation == 0) + if (is_jumbo) return; /* Skip jumbo subpackets that are duplicates. When we've had three or @@ -416,29 +406,30 @@ static void rxrpc_input_dup_data(struct rxrpc_call *call, rxrpc_seq_t seq, } /* - * Process a DATA packet, adding the packet to the Rx ring. + * Process a DATA packet, adding the packet to the Rx ring. The caller's + * packet ref must be passed on or discarded. */ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); enum rxrpc_call_state state; - unsigned int offset = sizeof(struct rxrpc_wire_header); - unsigned int ix; + unsigned int j, nr_subpackets; rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0; - rxrpc_seq_t seq = sp->hdr.seq, hard_ack; - bool immediate_ack = false, jumbo_bad = false, queued; - u16 len; - u8 ack = 0, flags, annotation = 0; + rxrpc_seq_t seq0 = sp->hdr.seq, hard_ack; + bool immediate_ack = false, jumbo_bad = false; + u8 ack = 0; _enter("{%u,%u},{%u,%u}", - call->rx_hard_ack, call->rx_top, skb->len, seq); + call->rx_hard_ack, call->rx_top, skb->len, seq0); - _proto("Rx DATA %%%u { #%u f=%02x }", - sp->hdr.serial, seq, sp->hdr.flags); + _proto("Rx DATA %%%u { #%u f=%02x n=%u }", + sp->hdr.serial, seq0, sp->hdr.flags, sp->nr_subpackets); state = READ_ONCE(call->state); - if (state >= RXRPC_CALL_COMPLETE) + if (state >= RXRPC_CALL_COMPLETE) { + rxrpc_free_skb(skb, rxrpc_skb_freed); return; + } if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST) { unsigned long timo = READ_ONCE(call->next_req_timo); @@ -463,137 +454,139 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) !rxrpc_receiving_reply(call)) goto unlock; - call->ackr_prev_seq = seq; - + call->ackr_prev_seq = seq0; hard_ack = READ_ONCE(call->rx_hard_ack); - if (after(seq, hard_ack + call->rx_winsize)) { - ack = RXRPC_ACK_EXCEEDS_WINDOW; - ack_serial = serial; - goto ack; - } - flags = sp->hdr.flags; - if (flags & RXRPC_JUMBO_PACKET) { + nr_subpackets = sp->nr_subpackets; + if (nr_subpackets > 1) { if (call->nr_jumbo_bad > 3) { ack = RXRPC_ACK_NOSPACE; ack_serial = serial; goto ack; } - annotation = 1; } -next_subpacket: - queued = false; - ix = seq & RXRPC_RXTX_BUFF_MASK; - len = skb->len; - if (flags & RXRPC_JUMBO_PACKET) - len = RXRPC_JUMBO_DATALEN; - - if (flags & RXRPC_LAST_PACKET) { - if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && - seq != call->rx_top) { - rxrpc_proto_abort("LSN", call, seq); - goto unlock; - } - } else { - if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && - after_eq(seq, call->rx_top)) { - rxrpc_proto_abort("LSA", call, seq); - goto unlock; + for (j = 0; j < nr_subpackets; j++) { + rxrpc_serial_t serial = sp->hdr.serial + j; + rxrpc_seq_t seq = seq0 + j; + unsigned int ix = seq & RXRPC_RXTX_BUFF_MASK; + bool terminal = (j == nr_subpackets - 1); + bool last = terminal && (sp->rx_flags & RXRPC_SKB_INCL_LAST); + u8 flags, annotation = j; + + _proto("Rx DATA+%u %%%u { #%x t=%u l=%u }", + j, serial, seq, terminal, last); + + if (last) { + if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && + seq != call->rx_top) { + rxrpc_proto_abort("LSN", call, seq); + goto unlock; + } + } else { + if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && + after_eq(seq, call->rx_top)) { + rxrpc_proto_abort("LSA", call, seq); + goto unlock; + } } - } - trace_rxrpc_rx_data(call->debug_id, seq, serial, flags, annotation); - if (before_eq(seq, hard_ack)) { - ack = RXRPC_ACK_DUPLICATE; - ack_serial = serial; - goto skip; - } - - if (flags & RXRPC_REQUEST_ACK && !ack) { - ack = RXRPC_ACK_REQUESTED; - ack_serial = serial; - } + flags = 0; + if (last) + flags |= RXRPC_LAST_PACKET; + if (!terminal) + flags |= RXRPC_JUMBO_PACKET; + if (test_bit(j, sp->rx_req_ack)) + flags |= RXRPC_REQUEST_ACK; + trace_rxrpc_rx_data(call->debug_id, seq, serial, flags, annotation); - if (call->rxtx_buffer[ix]) { - rxrpc_input_dup_data(call, seq, annotation, &jumbo_bad); - if (ack != RXRPC_ACK_DUPLICATE) { + if (before_eq(seq, hard_ack)) { ack = RXRPC_ACK_DUPLICATE; ack_serial = serial; + continue; } - immediate_ack = true; - goto skip; - } - - /* Queue the packet. We use a couple of memory barriers here as need - * to make sure that rx_top is perceived to be set after the buffer - * pointer and that the buffer pointer is set after the annotation and - * the skb data. - * - * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window() - * and also rxrpc_fill_out_ack(). - */ - rxrpc_get_skb(skb, rxrpc_skb_rx_got); - call->rxtx_annotations[ix] = annotation; - smp_wmb(); - call->rxtx_buffer[ix] = skb; - if (after(seq, call->rx_top)) { - smp_store_release(&call->rx_top, seq); - } else if (before(seq, call->rx_top)) { - /* Send an immediate ACK if we fill in a hole */ - if (!ack) { - ack = RXRPC_ACK_DELAY; - ack_serial = serial; - } - immediate_ack = true; - } - if (flags & RXRPC_LAST_PACKET) { - set_bit(RXRPC_CALL_RX_LAST, &call->flags); - trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq); - } else { - trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq); - } - queued = true; - if (after_eq(seq, call->rx_expect_next)) { - if (after(seq, call->rx_expect_next)) { - _net("OOS %u > %u", seq, call->rx_expect_next); - ack = RXRPC_ACK_OUT_OF_SEQUENCE; - ack_serial = serial; + if (call->rxtx_buffer[ix]) { + rxrpc_input_dup_data(call, seq, nr_subpackets > 1, + &jumbo_bad); + if (ack != RXRPC_ACK_DUPLICATE) { + ack = RXRPC_ACK_DUPLICATE; + ack_serial = serial; + } + immediate_ack = true; + continue; } - call->rx_expect_next = seq + 1; - } -skip: - offset += len; - if (flags & RXRPC_JUMBO_PACKET) { - if (skb_copy_bits(skb, offset, &flags, 1) < 0) { - rxrpc_proto_abort("XJF", call, seq); - goto unlock; - } - offset += sizeof(struct rxrpc_jumbo_header); - seq++; - serial++; - annotation++; - if (flags & RXRPC_JUMBO_PACKET) - annotation |= RXRPC_RX_ANNO_JLAST; if (after(seq, hard_ack + call->rx_winsize)) { ack = RXRPC_ACK_EXCEEDS_WINDOW; ack_serial = serial; - if (!jumbo_bad) { - call->nr_jumbo_bad++; - jumbo_bad = true; + if (flags & RXRPC_JUMBO_PACKET) { + if (!jumbo_bad) { + call->nr_jumbo_bad++; + jumbo_bad = true; + } } + goto ack; } - _proto("Rx DATA Jumbo %%%u", serial); - goto next_subpacket; - } + if (flags & RXRPC_REQUEST_ACK && !ack) { + ack = RXRPC_ACK_REQUESTED; + ack_serial = serial; + } + + /* Queue the packet. We use a couple of memory barriers here as need + * to make sure that rx_top is perceived to be set after the buffer + * pointer and that the buffer pointer is set after the annotation and + * the skb data. + * + * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window() + * and also rxrpc_fill_out_ack(). + */ + if (!terminal) + rxrpc_get_skb(skb, rxrpc_skb_got); + call->rxtx_annotations[ix] = annotation; + smp_wmb(); + call->rxtx_buffer[ix] = skb; + if (after(seq, call->rx_top)) { + smp_store_release(&call->rx_top, seq); + } else if (before(seq, call->rx_top)) { + /* Send an immediate ACK if we fill in a hole */ + if (!ack) { + ack = RXRPC_ACK_DELAY; + ack_serial = serial; + } + immediate_ack = true; + } + + if (terminal) { + /* From this point on, we're not allowed to touch the + * packet any longer as its ref now belongs to the Rx + * ring. + */ + skb = NULL; + sp = NULL; + } - if (queued && flags & RXRPC_LAST_PACKET && !ack) { - ack = RXRPC_ACK_DELAY; - ack_serial = serial; + if (last) { + set_bit(RXRPC_CALL_RX_LAST, &call->flags); + if (!ack) { + ack = RXRPC_ACK_DELAY; + ack_serial = serial; + } + trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq); + } else { + trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq); + } + + if (after_eq(seq, call->rx_expect_next)) { + if (after(seq, call->rx_expect_next)) { + _net("OOS %u > %u", seq, call->rx_expect_next); + ack = RXRPC_ACK_OUT_OF_SEQUENCE; + ack_serial = serial; + } + call->rx_expect_next = seq + 1; + } } ack: @@ -606,13 +599,12 @@ ack: false, true, rxrpc_propose_ack_input_data); - if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) { - trace_rxrpc_notify_socket(call->debug_id, serial); - rxrpc_notify_socket(call); - } + trace_rxrpc_notify_socket(call->debug_id, serial); + rxrpc_notify_socket(call); unlock: spin_unlock(&call->input_lock); + rxrpc_free_skb(skb, rxrpc_skb_freed); _leave(" [queued]"); } @@ -1021,7 +1013,7 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call, switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_DATA: rxrpc_input_data(call, skb); - break; + goto no_free; case RXRPC_PACKET_TYPE_ACK: rxrpc_input_ack(call, skb); @@ -1048,6 +1040,8 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call, break; } + rxrpc_free_skb(skb, rxrpc_skb_freed); +no_free: _leave(""); } @@ -1109,7 +1103,7 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local, skb_queue_tail(&local->event_queue, skb); rxrpc_queue_local(local); } else { - rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + rxrpc_free_skb(skb, rxrpc_skb_freed); } } @@ -1124,7 +1118,7 @@ static void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) skb_queue_tail(&local->reject_queue, skb); rxrpc_queue_local(local); } else { - rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + rxrpc_free_skb(skb, rxrpc_skb_freed); } } @@ -1188,7 +1182,7 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) if (skb->tstamp == 0) skb->tstamp = ktime_get_real(); - rxrpc_new_skb(skb, rxrpc_skb_rx_received); + rxrpc_new_skb(skb, rxrpc_skb_received); skb_pull(skb, sizeof(struct udphdr)); @@ -1205,7 +1199,7 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) static int lose; if ((lose++ & 7) == 7) { trace_rxrpc_rx_lose(sp); - rxrpc_free_skb(skb, rxrpc_skb_rx_lost); + rxrpc_free_skb(skb, rxrpc_skb_lost); return 0; } } @@ -1237,9 +1231,26 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) if (sp->hdr.callNumber == 0 || sp->hdr.seq == 0) goto bad_message; - if (sp->hdr.flags & RXRPC_JUMBO_PACKET && - !rxrpc_validate_jumbo(skb)) + if (!rxrpc_validate_data(skb)) goto bad_message; + + /* Unshare the packet so that it can be modified for in-place + * decryption. + */ + if (sp->hdr.securityIndex != 0) { + struct sk_buff *nskb = skb_unshare(skb, GFP_ATOMIC); + if (!nskb) { + rxrpc_eaten_skb(skb, rxrpc_skb_unshared_nomem); + goto out; + } + + if (nskb != skb) { + rxrpc_eaten_skb(skb, rxrpc_skb_received); + skb = nskb; + rxrpc_new_skb(skb, rxrpc_skb_unshared); + sp = rxrpc_skb(skb); + } + } break; case RXRPC_PACKET_TYPE_CHALLENGE: @@ -1369,15 +1380,16 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) call = rxrpc_new_incoming_call(local, rx, skb); if (!call) goto reject_packet; - rxrpc_send_ping(call, skb); - mutex_unlock(&call->user_mutex); } + /* Process a call packet; this either discards or passes on the ref + * elsewhere. + */ rxrpc_input_call_packet(call, skb); - goto discard; + goto out; discard: - rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + rxrpc_free_skb(skb, rxrpc_skb_freed); out: trace_rxrpc_rx_done(0, 0); return 0; diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index a29d26c273b5..f6c59f5fae9d 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -33,6 +33,10 @@ static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, return 0; } +static void none_free_call_crypto(struct rxrpc_call *call) +{ +} + static void none_locate_data(struct rxrpc_call *call, struct sk_buff *skb, unsigned int *_offset, unsigned int *_len) { @@ -83,6 +87,7 @@ const struct rxrpc_security rxrpc_no_security = { .exit = none_exit, .init_connection_security = none_init_connection_security, .prime_packet_security = none_prime_packet_security, + .free_call_crypto = none_free_call_crypto, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, .locate_data = none_locate_data, diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index e93a78f7c05e..3ce6d628cd75 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -90,7 +90,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) if (skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - rxrpc_see_skb(skb, rxrpc_skb_rx_seen); + rxrpc_see_skb(skb, rxrpc_skb_seen); _debug("{%d},{%u}", local->debug_id, sp->hdr.type); switch (sp->hdr.type) { @@ -108,7 +108,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) break; } - rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + rxrpc_free_skb(skb, rxrpc_skb_freed); } _leave(""); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 72a6e12a9304..a6c1349e965d 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -364,11 +364,14 @@ void rxrpc_queue_local(struct rxrpc_local *local) void rxrpc_put_local(struct rxrpc_local *local) { const void *here = __builtin_return_address(0); + unsigned int debug_id; int n; if (local) { + debug_id = local->debug_id; + n = atomic_dec_return(&local->usage); - trace_rxrpc_local(local->debug_id, rxrpc_local_put, n, here); + trace_rxrpc_local(debug_id, rxrpc_local_put, n, here); if (n == 0) call_rcu(&local->rcu, rxrpc_local_rcu); @@ -380,14 +383,11 @@ void rxrpc_put_local(struct rxrpc_local *local) */ struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local) { - unsigned int au; - local = rxrpc_get_local_maybe(local); if (!local) return NULL; - au = atomic_fetch_add_unless(&local->active_users, 1, 0); - if (au == 0) { + if (!__rxrpc_use_local(local)) { rxrpc_put_local(local); return NULL; } @@ -401,14 +401,11 @@ struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local) */ void rxrpc_unuse_local(struct rxrpc_local *local) { - unsigned int au; - if (local) { - au = atomic_dec_return(&local->active_users); - if (au == 0) + if (__rxrpc_unuse_local(local)) { + rxrpc_get_local(local); rxrpc_queue_local(local); - else - rxrpc_put_local(local); + } } } @@ -426,11 +423,14 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local) _enter("%d", local->debug_id); + local->dead = true; + mutex_lock(&rxnet->local_mutex); list_del_init(&local->link); mutex_unlock(&rxnet->local_mutex); - ASSERT(RB_EMPTY_ROOT(&local->client_conns)); + rxrpc_clean_up_local_conns(local); + rxrpc_service_connection_reaper(&rxnet->service_conn_reaper); ASSERT(!local->service); if (socket) { @@ -462,7 +462,7 @@ static void rxrpc_local_processor(struct work_struct *work) do { again = false; - if (atomic_read(&local->active_users) == 0) { + if (!__rxrpc_use_local(local)) { rxrpc_local_destroyer(local); break; } @@ -476,6 +476,8 @@ static void rxrpc_local_processor(struct work_struct *work) rxrpc_process_local_events(local); again = true; } + + __rxrpc_unuse_local(local); } while (again); rxrpc_put_local(local); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 369e516c4bdf..bad3d2420344 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -129,7 +129,7 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, rxrpc_serial_t *_serial) { - struct rxrpc_connection *conn = NULL; + struct rxrpc_connection *conn; struct rxrpc_ack_buffer *pkt; struct msghdr msg; struct kvec iov[2]; @@ -139,18 +139,14 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, int ret; u8 reason; - spin_lock_bh(&call->lock); - if (call->conn) - conn = rxrpc_get_connection_maybe(call->conn); - spin_unlock_bh(&call->lock); - if (!conn) + if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) return -ECONNRESET; pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); - if (!pkt) { - rxrpc_put_connection(conn); + if (!pkt) return -ENOMEM; - } + + conn = call->conn; msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; @@ -244,7 +240,6 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, } out: - rxrpc_put_connection(conn); kfree(pkt); return ret; } @@ -254,7 +249,7 @@ out: */ int rxrpc_send_abort_packet(struct rxrpc_call *call) { - struct rxrpc_connection *conn = NULL; + struct rxrpc_connection *conn; struct rxrpc_abort_buffer pkt; struct msghdr msg; struct kvec iov[1]; @@ -271,13 +266,11 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) test_bit(RXRPC_CALL_TX_LAST, &call->flags)) return 0; - spin_lock_bh(&call->lock); - if (call->conn) - conn = rxrpc_get_connection_maybe(call->conn); - spin_unlock_bh(&call->lock); - if (!conn) + if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) return -ECONNRESET; + conn = call->conn; + msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; msg.msg_control = NULL; @@ -312,8 +305,6 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) trace_rxrpc_tx_packet(call->debug_id, &pkt.whdr, rxrpc_tx_point_call_abort); rxrpc_tx_backoff(call, ret); - - rxrpc_put_connection(conn); return ret; } @@ -565,7 +556,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) memset(&whdr, 0, sizeof(whdr)); while ((skb = skb_dequeue(&local->reject_queue))) { - rxrpc_see_skb(skb, rxrpc_skb_rx_seen); + rxrpc_see_skb(skb, rxrpc_skb_seen); sp = rxrpc_skb(skb); switch (skb->mark) { @@ -581,7 +572,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) ioc = 2; break; default: - rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + rxrpc_free_skb(skb, rxrpc_skb_freed); continue; } @@ -606,7 +597,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) rxrpc_tx_point_reject); } - rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + rxrpc_free_skb(skb, rxrpc_skb_freed); } _leave(""); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 7666ec72d37e..923b263c401b 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -147,10 +147,16 @@ void rxrpc_error_report(struct sock *sk) { struct sock_exterr_skb *serr; struct sockaddr_rxrpc srx; - struct rxrpc_local *local = sk->sk_user_data; + struct rxrpc_local *local; struct rxrpc_peer *peer; struct sk_buff *skb; + rcu_read_lock(); + local = rcu_dereference_sk_user_data(sk); + if (unlikely(!local)) { + rcu_read_unlock(); + return; + } _enter("%p{%d}", sk, local->debug_id); /* Clear the outstanding error value on the socket so that it doesn't @@ -160,24 +166,25 @@ void rxrpc_error_report(struct sock *sk) skb = sock_dequeue_err_skb(sk); if (!skb) { + rcu_read_unlock(); _leave("UDP socket errqueue empty"); return; } - rxrpc_new_skb(skb, rxrpc_skb_rx_received); + rxrpc_new_skb(skb, rxrpc_skb_received); serr = SKB_EXT_ERR(skb); if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { _leave("UDP empty message"); - rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + rcu_read_unlock(); + rxrpc_free_skb(skb, rxrpc_skb_freed); return; } - rcu_read_lock(); peer = rxrpc_lookup_peer_icmp_rcu(local, skb, &srx); if (peer && !rxrpc_get_peer_maybe(peer)) peer = NULL; if (!peer) { rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + rxrpc_free_skb(skb, rxrpc_skb_freed); _leave(" [no peer]"); return; } @@ -189,7 +196,7 @@ void rxrpc_error_report(struct sock *sk) serr->ee.ee_code == ICMP_FRAG_NEEDED)) { rxrpc_adjust_mtu(peer, serr); rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + rxrpc_free_skb(skb, rxrpc_skb_freed); rxrpc_put_peer(peer); _leave(" [MTU update]"); return; @@ -197,7 +204,7 @@ void rxrpc_error_report(struct sock *sk) rxrpc_store_error(peer, serr); rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + rxrpc_free_skb(skb, rxrpc_skb_freed); rxrpc_put_peer(peer); _leave(""); @@ -357,27 +364,31 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, if (!rxrpc_get_peer_maybe(peer)) continue; - spin_unlock_bh(&rxnet->peer_hash_lock); + if (__rxrpc_use_local(peer->local)) { + spin_unlock_bh(&rxnet->peer_hash_lock); - keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME; - slot = keepalive_at - base; - _debug("%02x peer %u t=%d {%pISp}", - cursor, peer->debug_id, slot, &peer->srx.transport); + keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME; + slot = keepalive_at - base; + _debug("%02x peer %u t=%d {%pISp}", + cursor, peer->debug_id, slot, &peer->srx.transport); - if (keepalive_at <= base || - keepalive_at > base + RXRPC_KEEPALIVE_TIME) { - rxrpc_send_keepalive(peer); - slot = RXRPC_KEEPALIVE_TIME; - } + if (keepalive_at <= base || + keepalive_at > base + RXRPC_KEEPALIVE_TIME) { + rxrpc_send_keepalive(peer); + slot = RXRPC_KEEPALIVE_TIME; + } - /* A transmission to this peer occurred since last we examined - * it so put it into the appropriate future bucket. - */ - slot += cursor; - slot &= mask; - spin_lock_bh(&rxnet->peer_hash_lock); - list_add_tail(&peer->keepalive_link, - &rxnet->peer_keepalive[slot & mask]); + /* A transmission to this peer occurred since last we + * examined it so put it into the appropriate future + * bucket. + */ + slot += cursor; + slot &= mask; + spin_lock_bh(&rxnet->peer_hash_lock); + list_add_tail(&peer->keepalive_link, + &rxnet->peer_keepalive[slot & mask]); + rxrpc_unuse_local(peer->local); + } rxrpc_put_peer_locked(peer); } diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 9c3ac96f71cb..452163eadb98 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -209,6 +209,7 @@ static void rxrpc_assess_MTU_size(struct rxrpc_sock *rx, */ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) { + const void *here = __builtin_return_address(0); struct rxrpc_peer *peer; _enter(""); @@ -216,7 +217,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) peer = kzalloc(sizeof(struct rxrpc_peer), gfp); if (peer) { atomic_set(&peer->usage, 1); - peer->local = local; + peer->local = rxrpc_get_local(local); INIT_HLIST_HEAD(&peer->error_targets); peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); @@ -230,6 +231,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) peer->cong_cwnd = 3; else peer->cong_cwnd = 4; + trace_rxrpc_peer(peer->debug_id, rxrpc_peer_new, 1, here); } _leave(" = %p", peer); @@ -307,7 +309,6 @@ void rxrpc_new_incoming_peer(struct rxrpc_sock *rx, struct rxrpc_local *local, unsigned long hash_key; hash_key = rxrpc_peer_hash_key(local, &peer->srx); - peer->local = local; rxrpc_init_peer(rx, peer, hash_key); spin_lock(&rxnet->peer_hash_lock); @@ -382,7 +383,7 @@ struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer) int n; n = atomic_inc_return(&peer->usage); - trace_rxrpc_peer(peer, rxrpc_peer_got, n, here); + trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, n, here); return peer; } @@ -396,7 +397,7 @@ struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) if (peer) { int n = atomic_fetch_add_unless(&peer->usage, 1, 0); if (n > 0) - trace_rxrpc_peer(peer, rxrpc_peer_got, n + 1, here); + trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, n + 1, here); else peer = NULL; } @@ -417,6 +418,7 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer) list_del_init(&peer->keepalive_link); spin_unlock_bh(&rxnet->peer_hash_lock); + rxrpc_put_local(peer->local); kfree_rcu(peer, rcu); } @@ -426,11 +428,13 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer) void rxrpc_put_peer(struct rxrpc_peer *peer) { const void *here = __builtin_return_address(0); + unsigned int debug_id; int n; if (peer) { + debug_id = peer->debug_id; n = atomic_dec_return(&peer->usage); - trace_rxrpc_peer(peer, rxrpc_peer_put, n, here); + trace_rxrpc_peer(debug_id, rxrpc_peer_put, n, here); if (n == 0) __rxrpc_put_peer(peer); } @@ -443,13 +447,15 @@ void rxrpc_put_peer(struct rxrpc_peer *peer) void rxrpc_put_peer_locked(struct rxrpc_peer *peer) { const void *here = __builtin_return_address(0); + unsigned int debug_id = peer->debug_id; int n; n = atomic_dec_return(&peer->usage); - trace_rxrpc_peer(peer, rxrpc_peer_put, n, here); + trace_rxrpc_peer(debug_id, rxrpc_peer_put, n, here); if (n == 0) { hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); + rxrpc_put_local(peer->local); kfree_rcu(peer, rcu); } } diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index 99ce322d7caa..49bb972539aa 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -89,6 +89,15 @@ struct rxrpc_jumbo_header { #define RXRPC_JUMBO_DATALEN 1412 /* non-terminal jumbo packet data length */ #define RXRPC_JUMBO_SUBPKTLEN (RXRPC_JUMBO_DATALEN + sizeof(struct rxrpc_jumbo_header)) +/* + * The maximum number of subpackets that can possibly fit in a UDP packet is: + * + * ((max_IP - IP_hdr - UDP_hdr) / RXRPC_JUMBO_SUBPKTLEN) + 1 + * = ((65535 - 28 - 28) / 1416) + 1 + * = 46 non-terminal packets and 1 terminal packet. + */ +#define RXRPC_MAX_NR_JUMBO 47 + /*****************************************************************************/ /* * on-the-wire Rx ACK packet data payload diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 9a7e1bc9791d..8578c39ec839 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -177,7 +177,8 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) struct sk_buff *skb; rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; - u8 flags; + bool last = false; + u8 subpacket; int ix; _enter("%d", call->debug_id); @@ -189,23 +190,25 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) hard_ack++; ix = hard_ack & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; - rxrpc_see_skb(skb, rxrpc_skb_rx_rotated); + rxrpc_see_skb(skb, rxrpc_skb_rotated); sp = rxrpc_skb(skb); - flags = sp->hdr.flags; - serial = sp->hdr.serial; - if (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) - serial += (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) - 1; + + subpacket = call->rxtx_annotations[ix] & RXRPC_RX_ANNO_SUBPACKET; + serial = sp->hdr.serial + subpacket; + + if (subpacket == sp->nr_subpackets - 1 && + sp->rx_flags & RXRPC_SKB_INCL_LAST) + last = true; call->rxtx_buffer[ix] = NULL; call->rxtx_annotations[ix] = 0; /* Barrier against rxrpc_input_data(). */ smp_store_release(&call->rx_hard_ack, hard_ack); - rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + rxrpc_free_skb(skb, rxrpc_skb_freed); - _debug("%u,%u,%02x", hard_ack, top, flags); trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack); - if (flags & RXRPC_LAST_PACKET) { + if (last) { rxrpc_end_rx_phase(call, serial); } else { /* Check to see if there's an ACK that needs sending. */ @@ -233,22 +236,23 @@ static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); rxrpc_seq_t seq = sp->hdr.seq; u16 cksum = sp->hdr.cksum; + u8 subpacket = annotation & RXRPC_RX_ANNO_SUBPACKET; _enter(""); /* For all but the head jumbo subpacket, the security checksum is in a * jumbo header immediately prior to the data. */ - if ((annotation & RXRPC_RX_ANNO_JUMBO) > 1) { + if (subpacket > 0) { __be16 tmp; if (skb_copy_bits(skb, offset - 2, &tmp, 2) < 0) BUG(); cksum = ntohs(tmp); - seq += (annotation & RXRPC_RX_ANNO_JUMBO) - 1; + seq += subpacket; } - return call->conn->security->verify_packet(call, skb, offset, len, - seq, cksum); + return call->security->verify_packet(call, skb, offset, len, + seq, cksum); } /* @@ -263,21 +267,24 @@ static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, */ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb, u8 *_annotation, - unsigned int *_offset, unsigned int *_len) + unsigned int *_offset, unsigned int *_len, + bool *_last) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); unsigned int offset = sizeof(struct rxrpc_wire_header); unsigned int len; + bool last = false; int ret; u8 annotation = *_annotation; + u8 subpacket = annotation & RXRPC_RX_ANNO_SUBPACKET; /* Locate the subpacket */ + offset += subpacket * RXRPC_JUMBO_SUBPKTLEN; len = skb->len - offset; - if ((annotation & RXRPC_RX_ANNO_JUMBO) > 0) { - offset += (((annotation & RXRPC_RX_ANNO_JUMBO) - 1) * - RXRPC_JUMBO_SUBPKTLEN); - len = (annotation & RXRPC_RX_ANNO_JLAST) ? - skb->len - offset : RXRPC_JUMBO_SUBPKTLEN; - } + if (subpacket < sp->nr_subpackets - 1) + len = RXRPC_JUMBO_DATALEN; + else if (sp->rx_flags & RXRPC_SKB_INCL_LAST) + last = true; if (!(annotation & RXRPC_RX_ANNO_VERIFIED)) { ret = rxrpc_verify_packet(call, skb, annotation, offset, len); @@ -288,7 +295,8 @@ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb, *_offset = offset; *_len = len; - call->conn->security->locate_data(call, skb, _offset, _len); + *_last = last; + call->security->locate_data(call, skb, _offset, _len); return 0; } @@ -303,9 +311,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, { struct rxrpc_skb_priv *sp; struct sk_buff *skb; + rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top, seq; size_t remain; - bool last; + bool rx_pkt_last; unsigned int rx_pkt_offset, rx_pkt_len; int ix, copy, ret = -EAGAIN, ret2; @@ -315,6 +324,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, rx_pkt_offset = call->rx_pkt_offset; rx_pkt_len = call->rx_pkt_len; + rx_pkt_last = call->rx_pkt_last; if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) { seq = call->rx_hard_ack; @@ -325,6 +335,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, /* Barriers against rxrpc_input_data(). */ hard_ack = call->rx_hard_ack; seq = hard_ack + 1; + while (top = smp_load_acquire(&call->rx_top), before_eq(seq, top) ) { @@ -336,12 +347,15 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, break; } smp_rmb(); - rxrpc_see_skb(skb, rxrpc_skb_rx_seen); + rxrpc_see_skb(skb, rxrpc_skb_seen); sp = rxrpc_skb(skb); - if (!(flags & MSG_PEEK)) + if (!(flags & MSG_PEEK)) { + serial = sp->hdr.serial; + serial += call->rxtx_annotations[ix] & RXRPC_RX_ANNO_SUBPACKET; trace_rxrpc_receive(call, rxrpc_receive_front, - sp->hdr.serial, seq); + serial, seq); + } if (msg) sock_recv_timestamp(msg, sock->sk, skb); @@ -349,7 +363,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (rx_pkt_offset == 0) { ret2 = rxrpc_locate_data(call, skb, &call->rxtx_annotations[ix], - &rx_pkt_offset, &rx_pkt_len); + &rx_pkt_offset, &rx_pkt_len, + &rx_pkt_last); trace_rxrpc_recvmsg(call, rxrpc_recvmsg_next, seq, rx_pkt_offset, rx_pkt_len, ret2); if (ret2 < 0) { @@ -389,13 +404,12 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, } /* The whole packet has been transferred. */ - last = sp->hdr.flags & RXRPC_LAST_PACKET; if (!(flags & MSG_PEEK)) rxrpc_rotate_rx_window(call); rx_pkt_offset = 0; rx_pkt_len = 0; - if (last) { + if (rx_pkt_last) { ASSERTCMP(seq, ==, READ_ONCE(call->rx_top)); ret = 1; goto out; @@ -408,6 +422,7 @@ out: if (!(flags & MSG_PEEK)) { call->rx_pkt_offset = rx_pkt_offset; call->rx_pkt_len = rx_pkt_len; + call->rx_pkt_last = rx_pkt_last; } done: trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq, diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index ae8cd8926456..098f1f9ec53b 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -43,6 +43,7 @@ struct rxkad_level2_hdr { * packets */ static struct crypto_sync_skcipher *rxkad_ci; +static struct skcipher_request *rxkad_ci_req; static DEFINE_MUTEX(rxkad_ci_mutex); /* @@ -99,8 +100,8 @@ error: */ static int rxkad_prime_packet_security(struct rxrpc_connection *conn) { + struct skcipher_request *req; struct rxrpc_key_token *token; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); struct scatterlist sg; struct rxrpc_crypt iv; __be32 *tmpbuf; @@ -115,6 +116,12 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn) if (!tmpbuf) return -ENOMEM; + req = skcipher_request_alloc(&conn->cipher->base, GFP_NOFS); + if (!req) { + kfree(tmpbuf); + return -ENOMEM; + } + token = conn->params.key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); @@ -128,7 +135,7 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn) skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, tmpsize, iv.x); crypto_skcipher_encrypt(req); - skcipher_request_zero(req); + skcipher_request_free(req); memcpy(&conn->csum_iv, tmpbuf + 2, sizeof(conn->csum_iv)); kfree(tmpbuf); @@ -137,6 +144,35 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn) } /* + * Allocate and prepare the crypto request on a call. For any particular call, + * this is called serially for the packets, so no lock should be necessary. + */ +static struct skcipher_request *rxkad_get_call_crypto(struct rxrpc_call *call) +{ + struct crypto_skcipher *tfm = &call->conn->cipher->base; + struct skcipher_request *cipher_req = call->cipher_req; + + if (!cipher_req) { + cipher_req = skcipher_request_alloc(tfm, GFP_NOFS); + if (!cipher_req) + return NULL; + call->cipher_req = cipher_req; + } + + return cipher_req; +} + +/* + * Clean up the crypto on a call. + */ +static void rxkad_free_call_crypto(struct rxrpc_call *call) +{ + if (call->cipher_req) + skcipher_request_free(call->cipher_req); + call->cipher_req = NULL; +} + +/* * partially encrypt a packet (level 1 security) */ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, @@ -187,10 +223,8 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, struct rxrpc_skb_priv *sp; struct rxrpc_crypt iv; struct scatterlist sg[16]; - struct sk_buff *trailer; unsigned int len; u16 check; - int nsg; int err; sp = rxrpc_skb(skb); @@ -214,15 +248,14 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, crypto_skcipher_encrypt(req); /* we want to encrypt the skbuff in-place */ - nsg = skb_cow_data(skb, 0, &trailer); - err = -ENOMEM; - if (nsg < 0 || nsg > 16) + err = -EMSGSIZE; + if (skb_shinfo(skb)->nr_frags > 16) goto out; len = data_size + call->conn->size_align - 1; len &= ~(call->conn->size_align - 1); - sg_init_table(sg, nsg); + sg_init_table(sg, ARRAY_SIZE(sg)); err = skb_to_sgvec(skb, sg, 0, len); if (unlikely(err < 0)) goto out; @@ -246,7 +279,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, void *sechdr) { struct rxrpc_skb_priv *sp; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); + struct skcipher_request *req; struct rxrpc_crypt iv; struct scatterlist sg; u32 x, y; @@ -265,6 +298,10 @@ static int rxkad_secure_packet(struct rxrpc_call *call, if (ret < 0) return ret; + req = rxkad_get_call_crypto(call); + if (!req) + return -ENOMEM; + /* continue encrypting from where we left off */ memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); @@ -319,11 +356,10 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, struct rxkad_level1_hdr sechdr; struct rxrpc_crypt iv; struct scatterlist sg[16]; - struct sk_buff *trailer; bool aborted; u32 data_size, buf; u16 check; - int nsg, ret; + int ret; _enter(""); @@ -336,11 +372,7 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, /* Decrypt the skbuff in-place. TODO: We really want to decrypt * directly into the target buffer. */ - nsg = skb_cow_data(skb, 0, &trailer); - if (nsg < 0 || nsg > 16) - goto nomem; - - sg_init_table(sg, nsg); + sg_init_table(sg, ARRAY_SIZE(sg)); ret = skb_to_sgvec(skb, sg, offset, 8); if (unlikely(ret < 0)) return ret; @@ -388,10 +420,6 @@ protocol_error: if (aborted) rxrpc_send_abort_packet(call); return -EPROTO; - -nomem: - _leave(" = -ENOMEM"); - return -ENOMEM; } /* @@ -406,7 +434,6 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, struct rxkad_level2_hdr sechdr; struct rxrpc_crypt iv; struct scatterlist _sg[4], *sg; - struct sk_buff *trailer; bool aborted; u32 data_size, buf; u16 check; @@ -423,12 +450,11 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, /* Decrypt the skbuff in-place. TODO: We really want to decrypt * directly into the target buffer. */ - nsg = skb_cow_data(skb, 0, &trailer); - if (nsg < 0) - goto nomem; - sg = _sg; - if (unlikely(nsg > 4)) { + nsg = skb_shinfo(skb)->nr_frags; + if (nsg <= 4) { + nsg = 4; + } else { sg = kmalloc_array(nsg, sizeof(*sg), GFP_NOIO); if (!sg) goto nomem; @@ -502,7 +528,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, unsigned int offset, unsigned int len, rxrpc_seq_t seq, u16 expected_cksum) { - SYNC_SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); + struct skcipher_request *req; struct rxrpc_crypt iv; struct scatterlist sg; bool aborted; @@ -515,6 +541,10 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, if (!call->conn->cipher) return 0; + req = rxkad_get_call_crypto(call); + if (!req) + return -ENOMEM; + /* continue encrypting from where we left off */ memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); @@ -618,9 +648,9 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) u32 serial; int ret; - _enter("{%d,%x}", conn->debug_id, key_serial(conn->params.key)); + _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); - ret = key_validate(conn->params.key); + ret = key_validate(conn->server_key); if (ret < 0) return ret; @@ -747,14 +777,18 @@ static void rxkad_calc_response_checksum(struct rxkad_response *response) /* * encrypt the response packet */ -static void rxkad_encrypt_response(struct rxrpc_connection *conn, - struct rxkad_response *resp, - const struct rxkad_key *s2) +static int rxkad_encrypt_response(struct rxrpc_connection *conn, + struct rxkad_response *resp, + const struct rxkad_key *s2) { - SYNC_SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); + struct skcipher_request *req; struct rxrpc_crypt iv; struct scatterlist sg[1]; + req = skcipher_request_alloc(&conn->cipher->base, GFP_NOFS); + if (!req) + return -ENOMEM; + /* continue encrypting from where we left off */ memcpy(&iv, s2->session_key, sizeof(iv)); @@ -764,7 +798,8 @@ static void rxkad_encrypt_response(struct rxrpc_connection *conn, skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); crypto_skcipher_encrypt(req); - skcipher_request_zero(req); + skcipher_request_free(req); + return 0; } /* @@ -839,8 +874,9 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, /* calculate the response checksum and then do the encryption */ rxkad_calc_response_checksum(resp); - rxkad_encrypt_response(conn, resp, token->kad); - ret = rxkad_send_response(conn, &sp->hdr, resp, token->kad); + ret = rxkad_encrypt_response(conn, resp, token->kad); + if (ret == 0) + ret = rxkad_send_response(conn, &sp->hdr, resp, token->kad); kfree(resp); return ret; @@ -1017,18 +1053,16 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn, struct rxkad_response *resp, const struct rxrpc_crypt *session_key) { - SYNC_SKCIPHER_REQUEST_ON_STACK(req, rxkad_ci); + struct skcipher_request *req = rxkad_ci_req; struct scatterlist sg[1]; struct rxrpc_crypt iv; _enter(",,%08x%08x", ntohl(session_key->n[0]), ntohl(session_key->n[1])); - ASSERT(rxkad_ci != NULL); - mutex_lock(&rxkad_ci_mutex); if (crypto_sync_skcipher_setkey(rxkad_ci, session_key->x, - sizeof(*session_key)) < 0) + sizeof(*session_key)) < 0) BUG(); memcpy(&iv, session_key, sizeof(iv)); @@ -1222,10 +1256,26 @@ static void rxkad_clear(struct rxrpc_connection *conn) */ static int rxkad_init(void) { + struct crypto_sync_skcipher *tfm; + struct skcipher_request *req; + /* pin the cipher we need so that the crypto layer doesn't invoke * keventd to go get it */ - rxkad_ci = crypto_alloc_sync_skcipher("pcbc(fcrypt)", 0, 0); - return PTR_ERR_OR_ZERO(rxkad_ci); + tfm = crypto_alloc_sync_skcipher("pcbc(fcrypt)", 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + req = skcipher_request_alloc(&tfm->base, GFP_KERNEL); + if (!req) + goto nomem_tfm; + + rxkad_ci_req = req; + rxkad_ci = tfm; + return 0; + +nomem_tfm: + crypto_free_sync_skcipher(tfm); + return -ENOMEM; } /* @@ -1233,8 +1283,8 @@ static int rxkad_init(void) */ static void rxkad_exit(void) { - if (rxkad_ci) - crypto_free_sync_skcipher(rxkad_ci); + crypto_free_sync_skcipher(rxkad_ci); + skcipher_request_free(rxkad_ci_req); } /* @@ -1243,12 +1293,14 @@ static void rxkad_exit(void) const struct rxrpc_security rxkad = { .name = "rxkad", .security_index = RXRPC_SECURITY_RXKAD, + .no_key_abort = RXKADUNKNOWNKEY, .init = rxkad_init, .exit = rxkad_exit, .init_connection_security = rxkad_init_connection_security, .prime_packet_security = rxkad_prime_packet_security, .secure_packet = rxkad_secure_packet, .verify_packet = rxkad_verify_packet, + .free_call_crypto = rxkad_free_call_crypto, .locate_data = rxkad_locate_data, .issue_challenge = rxkad_issue_challenge, .respond_to_challenge = rxkad_respond_to_challenge, diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index a4c47d2b7054..9b1fb9ed0717 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -101,62 +101,58 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) } /* - * initialise the security on a server connection + * Find the security key for a server connection. */ -int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) +bool rxrpc_look_up_server_security(struct rxrpc_local *local, struct rxrpc_sock *rx, + const struct rxrpc_security **_sec, + struct key **_key, + struct sk_buff *skb) { const struct rxrpc_security *sec; - struct rxrpc_local *local = conn->params.local; - struct rxrpc_sock *rx; - struct key *key; - key_ref_t kref; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + key_ref_t kref = NULL; char kdesc[5 + 1 + 3 + 1]; _enter(""); - sprintf(kdesc, "%u:%u", conn->service_id, conn->security_ix); + sprintf(kdesc, "%u:%u", sp->hdr.serviceId, sp->hdr.securityIndex); - sec = rxrpc_security_lookup(conn->security_ix); + sec = rxrpc_security_lookup(sp->hdr.securityIndex); if (!sec) { - _leave(" = -ENOKEY [lookup]"); - return -ENOKEY; + trace_rxrpc_abort(0, "SVS", + sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_INVALID_OPERATION, EKEYREJECTED); + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; + skb->priority = RX_INVALID_OPERATION; + return false; } - /* find the service */ - read_lock(&local->services_lock); - rx = rcu_dereference_protected(local->service, - lockdep_is_held(&local->services_lock)); - if (rx && (rx->srx.srx_service == conn->service_id || - rx->second_service == conn->service_id)) - goto found_service; + if (sp->hdr.securityIndex == RXRPC_SECURITY_NONE) + goto out; - /* the service appears to have died */ - read_unlock(&local->services_lock); - _leave(" = -ENOENT"); - return -ENOENT; - -found_service: if (!rx->securities) { - read_unlock(&local->services_lock); - _leave(" = -ENOKEY"); - return -ENOKEY; + trace_rxrpc_abort(0, "SVR", + sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_INVALID_OPERATION, EKEYREJECTED); + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; + skb->priority = RX_INVALID_OPERATION; + return false; } /* look through the service's keyring */ kref = keyring_search(make_key_ref(rx->securities, 1UL), &key_type_rxrpc_s, kdesc, true); if (IS_ERR(kref)) { - read_unlock(&local->services_lock); - _leave(" = %ld [search]", PTR_ERR(kref)); - return PTR_ERR(kref); + trace_rxrpc_abort(0, "SVK", + sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + sec->no_key_abort, EKEYREJECTED); + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; + skb->priority = sec->no_key_abort; + return false; } - key = key_ref_to_ptr(kref); - read_unlock(&local->services_lock); - - conn->server_key = key; - conn->security = sec; - - _leave(" = 0"); - return 0; +out: + *_sec = sec; + *_key = key_ref_to_ptr(kref); + return true; } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index bae14438f869..813fd6888142 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -176,7 +176,7 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, skb->tstamp = ktime_get_real(); ix = seq & RXRPC_RXTX_BUFF_MASK; - rxrpc_get_skb(skb, rxrpc_skb_tx_got); + rxrpc_get_skb(skb, rxrpc_skb_got); call->rxtx_annotations[ix] = annotation; smp_wmb(); call->rxtx_buffer[ix] = skb; @@ -248,7 +248,7 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, } out: - rxrpc_free_skb(skb, rxrpc_skb_tx_freed); + rxrpc_free_skb(skb, rxrpc_skb_freed); _leave(" = %d", ret); return ret; } @@ -289,7 +289,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, skb = call->tx_pending; call->tx_pending = NULL; - rxrpc_see_skb(skb, rxrpc_skb_tx_seen); + rxrpc_see_skb(skb, rxrpc_skb_seen); copied = 0; do { @@ -336,7 +336,9 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, if (!skb) goto maybe_error; - rxrpc_new_skb(skb, rxrpc_skb_tx_new); + sp = rxrpc_skb(skb); + sp->rx_flags |= RXRPC_SKB_TX_BUFFER; + rxrpc_new_skb(skb, rxrpc_skb_new); _debug("ALLOC SEND %p", skb); @@ -346,7 +348,6 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, skb_reserve(skb, call->conn->security_size); skb->len += call->conn->security_size; - sp = rxrpc_skb(skb); sp->remain = chunk; if (sp->remain > skb_tailroom(skb)) sp->remain = skb_tailroom(skb); @@ -418,7 +419,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, call->tx_winsize) sp->hdr.flags |= RXRPC_MORE_PACKETS; - ret = conn->security->secure_packet( + ret = call->security->secure_packet( call, skb, skb->mark, skb->head); if (ret < 0) goto out; @@ -439,7 +440,7 @@ out: return ret; call_terminated: - rxrpc_free_skb(skb, rxrpc_skb_tx_freed); + rxrpc_free_skb(skb, rxrpc_skb_freed); _leave(" = %d", call->error); return call->error; @@ -660,6 +661,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) case RXRPC_CALL_SERVER_PREALLOC: case RXRPC_CALL_SERVER_SECURING: case RXRPC_CALL_SERVER_ACCEPTING: + rxrpc_put_call(call, rxrpc_call_put); ret = -EBUSY; goto error_release_sock; default: diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 9ad5045b7c2f..0348d2bf6f7d 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -14,7 +14,8 @@ #include <net/af_rxrpc.h> #include "ar-internal.h" -#define select_skb_count(op) (op >= rxrpc_skb_tx_cleaned ? &rxrpc_n_tx_skbs : &rxrpc_n_rx_skbs) +#define is_tx_skb(skb) (rxrpc_skb(skb)->rx_flags & RXRPC_SKB_TX_BUFFER) +#define select_skb_count(skb) (is_tx_skb(skb) ? &rxrpc_n_tx_skbs : &rxrpc_n_rx_skbs) /* * Note the allocation or reception of a socket buffer. @@ -22,8 +23,9 @@ void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); - int n = atomic_inc_return(select_skb_count(op)); - trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here); + int n = atomic_inc_return(select_skb_count(skb)); + trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, + rxrpc_skb(skb)->rx_flags, here); } /* @@ -33,8 +35,9 @@ void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); if (skb) { - int n = atomic_read(select_skb_count(op)); - trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here); + int n = atomic_read(select_skb_count(skb)); + trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, + rxrpc_skb(skb)->rx_flags, here); } } @@ -44,12 +47,23 @@ void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); - int n = atomic_inc_return(select_skb_count(op)); - trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here); + int n = atomic_inc_return(select_skb_count(skb)); + trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, + rxrpc_skb(skb)->rx_flags, here); skb_get(skb); } /* + * Note the dropping of a ref on a socket buffer by the core. + */ +void rxrpc_eaten_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) +{ + const void *here = __builtin_return_address(0); + int n = atomic_inc_return(&rxrpc_n_rx_skbs); + trace_rxrpc_skb(skb, op, 0, n, 0, here); +} + +/* * Note the destruction of a socket buffer. */ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) @@ -58,8 +72,9 @@ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) if (skb) { int n; CHECK_SLAB_OKAY(&skb->users); - n = atomic_dec_return(select_skb_count(op)); - trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here); + n = atomic_dec_return(select_skb_count(skb)); + trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, + rxrpc_skb(skb)->rx_flags, here); kfree_skb(skb); } } @@ -72,9 +87,10 @@ void rxrpc_purge_queue(struct sk_buff_head *list) const void *here = __builtin_return_address(0); struct sk_buff *skb; while ((skb = skb_dequeue((list))) != NULL) { - int n = atomic_dec_return(select_skb_count(rxrpc_skb_rx_purged)); - trace_rxrpc_skb(skb, rxrpc_skb_rx_purged, - refcount_read(&skb->users), n, here); + int n = atomic_dec_return(select_skb_count(skb)); + trace_rxrpc_skb(skb, rxrpc_skb_purged, + refcount_read(&skb->users), n, + rxrpc_skb(skb)->rx_flags, here); kfree_skb(skb); } } diff --git a/net/sched/Kconfig b/net/sched/Kconfig index afd2ba157a13..edde0e519438 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -324,7 +324,7 @@ config NET_SCH_CAKE tristate "Common Applications Kept Enhanced (CAKE)" help Say Y here if you want to use the Common Applications Kept Enhanced - (CAKE) queue management algorithm. + (CAKE) queue management algorithm. To compile this driver as a module, choose M here: the module will be called sch_cake. @@ -366,6 +366,19 @@ config NET_SCH_PIE If unsure, say N. +config NET_SCH_FQ_PIE + depends on NET_SCH_PIE + tristate "Flow Queue Proportional Integral controller Enhanced (FQ-PIE)" + help + Say Y here if you want to use the Flow Queue Proportional Integral + controller Enhanced (FQ-PIE) packet scheduling algorithm. + For more information, please see https://tools.ietf.org/html/rfc8033 + + To compile this driver as a module, choose M here: the module + will be called sch_fq_pie. + + If unsure, say N. + config NET_SCH_INGRESS tristate "Ingress/classifier-action Qdisc" depends on NET_CLS_ACT @@ -409,6 +422,23 @@ config NET_SCH_PLUG To compile this code as a module, choose M here: the module will be called sch_plug. +config NET_SCH_ETS + tristate "Enhanced transmission selection scheduler (ETS)" + help + The Enhanced Transmission Selection scheduler is a classful + queuing discipline that merges functionality of PRIO and DRR + qdiscs in one scheduler. ETS makes it easy to configure a set of + strict and bandwidth-sharing bands to implement the transmission + selection described in 802.1Qaz. + + Say Y here if you want to use the ETS packet scheduling + algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_ets. + + If unsure, say N. + menuconfig NET_SCH_DEFAULT bool "Allow override default queue discipline" ---help--- @@ -730,8 +760,8 @@ config NET_CLS_ACT config NET_ACT_POLICE tristate "Traffic Policing" - depends on NET_CLS_ACT - ---help--- + depends on NET_CLS_ACT + ---help--- Say Y here if you want to do traffic policing, i.e. strict bandwidth limiting. This action replaces the existing policing module. @@ -740,9 +770,9 @@ config NET_ACT_POLICE module will be called act_police. config NET_ACT_GACT - tristate "Generic actions" - depends on NET_CLS_ACT - ---help--- + tristate "Generic actions" + depends on NET_CLS_ACT + ---help--- Say Y here to take generic actions such as dropping and accepting packets. @@ -750,15 +780,15 @@ config NET_ACT_GACT module will be called act_gact. config GACT_PROB - bool "Probability support" - depends on NET_ACT_GACT - ---help--- + bool "Probability support" + depends on NET_ACT_GACT + ---help--- Say Y here to use the generic action randomly or deterministically. config NET_ACT_MIRRED - tristate "Redirecting and Mirroring" - depends on NET_CLS_ACT - ---help--- + tristate "Redirecting and Mirroring" + depends on NET_CLS_ACT + ---help--- Say Y here to allow packets to be mirrored or redirected to other devices. @@ -766,10 +796,10 @@ config NET_ACT_MIRRED module will be called act_mirred. config NET_ACT_SAMPLE - tristate "Traffic Sampling" - depends on NET_CLS_ACT - select PSAMPLE - ---help--- + tristate "Traffic Sampling" + depends on NET_CLS_ACT + select PSAMPLE + ---help--- Say Y here to allow packet sampling tc action. The packet sample action consists of statistically choosing packets and sampling them using the psample module. @@ -778,9 +808,9 @@ config NET_ACT_SAMPLE module will be called act_sample. config NET_ACT_IPT - tristate "IPtables targets" - depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES - ---help--- + tristate "IPtables targets" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + ---help--- Say Y here to be able to invoke iptables targets after successful classification. @@ -788,9 +818,9 @@ config NET_ACT_IPT module will be called act_ipt. config NET_ACT_NAT - tristate "Stateless NAT" - depends on NET_CLS_ACT - ---help--- + tristate "Stateless NAT" + depends on NET_CLS_ACT + ---help--- Say Y here to do stateless NAT on IPv4 packets. You should use netfilter for NAT unless you know what you are doing. @@ -798,18 +828,18 @@ config NET_ACT_NAT module will be called act_nat. config NET_ACT_PEDIT - tristate "Packet Editing" - depends on NET_CLS_ACT - ---help--- + tristate "Packet Editing" + depends on NET_CLS_ACT + ---help--- Say Y here if you want to mangle the content of packets. To compile this code as a module, choose M here: the module will be called act_pedit. config NET_ACT_SIMP - tristate "Simple Example (Debug)" - depends on NET_CLS_ACT - ---help--- + tristate "Simple Example (Debug)" + depends on NET_CLS_ACT + ---help--- Say Y here to add a simple action for demonstration purposes. It is meant as an example and for debugging purposes. It will print a configured policy string followed by the packet count @@ -821,9 +851,9 @@ config NET_ACT_SIMP module will be called act_simple. config NET_ACT_SKBEDIT - tristate "SKB Editing" - depends on NET_CLS_ACT - ---help--- + tristate "SKB Editing" + depends on NET_CLS_ACT + ---help--- Say Y here to change skb priority or queue_mapping settings. If unsure, say N. @@ -832,10 +862,10 @@ config NET_ACT_SKBEDIT module will be called act_skbedit. config NET_ACT_CSUM - tristate "Checksum Updating" - depends on NET_CLS_ACT && INET - select LIBCRC32C - ---help--- + tristate "Checksum Updating" + depends on NET_CLS_ACT && INET + select LIBCRC32C + ---help--- Say Y here to update some common checksum after some direct packet alterations. @@ -854,9 +884,9 @@ config NET_ACT_MPLS module will be called act_mpls. config NET_ACT_VLAN - tristate "Vlan manipulation" - depends on NET_CLS_ACT - ---help--- + tristate "Vlan manipulation" + depends on NET_CLS_ACT + ---help--- Say Y here to push or pop vlan headers. If unsure, say N. @@ -865,9 +895,9 @@ config NET_ACT_VLAN module will be called act_vlan. config NET_ACT_BPF - tristate "BPF based action" - depends on NET_CLS_ACT - ---help--- + tristate "BPF based action" + depends on NET_CLS_ACT + ---help--- Say Y here to execute BPF code on packets. The BPF code will decide if the packet should be dropped or not. @@ -877,10 +907,10 @@ config NET_ACT_BPF module will be called act_bpf. config NET_ACT_CONNMARK - tristate "Netfilter Connection Mark Retriever" - depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES - depends on NF_CONNTRACK && NF_CONNTRACK_MARK - ---help--- + tristate "Netfilter Connection Mark Retriever" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + depends on NF_CONNTRACK && NF_CONNTRACK_MARK + ---help--- Say Y here to allow retrieving of conn mark If unsure, say N. @@ -889,10 +919,10 @@ config NET_ACT_CONNMARK module will be called act_connmark. config NET_ACT_CTINFO - tristate "Netfilter Connection Mark Actions" - depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES - depends on NF_CONNTRACK && NF_CONNTRACK_MARK - help + tristate "Netfilter Connection Mark Actions" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + depends on NF_CONNTRACK && NF_CONNTRACK_MARK + help Say Y here to allow transfer of a connmark stored information. Current actions transfer connmark stored DSCP into ipv4/v6 diffserv and/or to transfer connmark to packet @@ -906,21 +936,21 @@ config NET_ACT_CTINFO module will be called act_ctinfo. config NET_ACT_SKBMOD - tristate "skb data modification action" - depends on NET_CLS_ACT - ---help--- - Say Y here to allow modification of skb data + tristate "skb data modification action" + depends on NET_CLS_ACT + ---help--- + Say Y here to allow modification of skb data - If unsure, say N. + If unsure, say N. - To compile this code as a module, choose M here: the - module will be called act_skbmod. + To compile this code as a module, choose M here: the + module will be called act_skbmod. config NET_ACT_IFE - tristate "Inter-FE action based on IETF ForCES InterFE LFB" - depends on NET_CLS_ACT - select NET_IFE - ---help--- + tristate "Inter-FE action based on IETF ForCES InterFE LFB" + depends on NET_CLS_ACT + select NET_IFE + ---help--- Say Y here to allow for sourcing and terminating metadata For details refer to netdev01 paper: "Distributing Linux Traffic Control Classifier-Action Subsystem" @@ -930,9 +960,9 @@ config NET_ACT_IFE module will be called act_ife. config NET_ACT_TUNNEL_KEY - tristate "IP tunnel metadata manipulation" - depends on NET_CLS_ACT - ---help--- + tristate "IP tunnel metadata manipulation" + depends on NET_CLS_ACT + ---help--- Say Y here to set/release ip tunnel metadata. If unsure, say N. @@ -941,9 +971,9 @@ config NET_ACT_TUNNEL_KEY module will be called act_tunnel_key. config NET_ACT_CT - tristate "connection tracking tc action" - depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT - help + tristate "connection tracking tc action" + depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT + help Say Y here to allow sending the packets to conntrack module. If unsure, say N. @@ -952,16 +982,28 @@ config NET_ACT_CT module will be called act_ct. config NET_IFE_SKBMARK - tristate "Support to encoding decoding skb mark on IFE action" - depends on NET_ACT_IFE + tristate "Support to encoding decoding skb mark on IFE action" + depends on NET_ACT_IFE config NET_IFE_SKBPRIO - tristate "Support to encoding decoding skb prio on IFE action" - depends on NET_ACT_IFE + tristate "Support to encoding decoding skb prio on IFE action" + depends on NET_ACT_IFE config NET_IFE_SKBTCINDEX - tristate "Support to encoding decoding skb tcindex on IFE action" - depends on NET_ACT_IFE + tristate "Support to encoding decoding skb tcindex on IFE action" + depends on NET_ACT_IFE + +config NET_TC_SKB_EXT + bool "TC recirculation support" + depends on NET_CLS_ACT + select SKB_EXTENSIONS + + help + Say Y here to allow tc chain misses to continue in OvS datapath in + the correct recirc_id, and hardware chain misses to continue in + the correct chain in tc software datapath. + + Say N here if you won't be using tc<->ovs offload or tc chains offload. endif # NET_SCHED diff --git a/net/sched/Makefile b/net/sched/Makefile index 415d1e1f237e..31c367a6cd09 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o +obj-$(CONFIG_NET_SCH_ETS) += sch_ets.o obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o @@ -58,6 +59,7 @@ obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o +obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 339712296164..90a31b15585f 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -88,7 +88,7 @@ struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action, struct tcf_chain *goto_chain) { a->tcfa_action = action; - rcu_swap_protected(a->goto_chain, goto_chain, 1); + goto_chain = rcu_replace_pointer(a->goto_chain, goto_chain, 1); return goto_chain; } EXPORT_SYMBOL(tcf_action_set_ctrlact); @@ -188,6 +188,8 @@ static size_t tcf_action_shared_attrs_size(const struct tc_action *act) + nla_total_size(0) /* TCA_ACT_STATS nested */ /* TCA_STATS_BASIC */ + nla_total_size_64bit(sizeof(struct gnet_stats_basic)) + /* TCA_STATS_PKT64 */ + + nla_total_size_64bit(sizeof(u64)) /* TCA_STATS_QUEUE */ + nla_total_size_64bit(sizeof(struct gnet_stats_queue)) + nla_total_size(0) /* TCA_OPTIONS nested */ @@ -399,7 +401,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index) int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, - int bind, bool cpustats) + int bind, bool cpustats, u32 flags) { struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_idrinfo *idrinfo = tn->idrinfo; @@ -427,6 +429,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, p->tcfa_tm.install = jiffies; p->tcfa_tm.lastuse = jiffies; p->tcfa_tm.firstuse = 0; + p->tcfa_flags = flags; if (est) { err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, &p->tcfa_rate_est, @@ -451,6 +454,17 @@ err1: } EXPORT_SYMBOL(tcf_idr_create); +int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, + struct nlattr *est, struct tc_action **a, + const struct tc_action_ops *ops, int bind, + u32 flags) +{ + /* Set cpustats according to actions flags. */ + return tcf_idr_create(tn, index, est, a, ops, bind, + !(flags & TCA_ACT_FLAGS_NO_PERCPU_STATS), flags); +} +EXPORT_SYMBOL(tcf_idr_create_from_flags); + void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a) { struct tcf_idrinfo *idrinfo = tn->idrinfo; @@ -773,6 +787,14 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) } rcu_read_unlock(); + if (a->tcfa_flags) { + struct nla_bitfield32 flags = { a->tcfa_flags, + a->tcfa_flags, }; + + if (nla_put(skb, TCA_ACT_FLAGS, sizeof(flags), &flags)) + goto nla_put_failure; + } + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; @@ -831,12 +853,24 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) return c; } +static const u32 tca_act_flags_allowed = TCA_ACT_FLAGS_NO_PERCPU_STATS; +static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { + [TCA_ACT_KIND] = { .type = NLA_STRING }, + [TCA_ACT_INDEX] = { .type = NLA_U32 }, + [TCA_ACT_COOKIE] = { .type = NLA_BINARY, + .len = TC_COOKIE_MAX_SIZE }, + [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, + [TCA_ACT_FLAGS] = { .type = NLA_BITFIELD32, + .validation_data = &tca_act_flags_allowed }, +}; + struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) { + struct nla_bitfield32 flags = { 0, 0 }; struct tc_action *a; struct tc_action_ops *a_o; struct tc_cookie *cookie = NULL; @@ -846,8 +880,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, int err; if (name == NULL) { - err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, - extack); + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, + tcf_action_policy, extack); if (err < 0) goto err_out; err = -EINVAL; @@ -861,13 +895,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, goto err_out; } if (tb[TCA_ACT_COOKIE]) { - int cklen = nla_len(tb[TCA_ACT_COOKIE]); - - if (cklen > TC_COOKIE_MAX_SIZE) { - NL_SET_ERR_MSG(extack, "TC cookie size above the maximum"); - goto err_out; - } - cookie = nla_memdup_cookie(tb); if (!cookie) { NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); @@ -875,6 +902,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, goto err_out; } } + if (tb[TCA_ACT_FLAGS]) + flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); } else { if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) { NL_SET_ERR_MSG(extack, "TC action name too long"); @@ -913,10 +942,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, /* backward compatibility for policer */ if (name == NULL) err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, - rtnl_held, tp, extack); + rtnl_held, tp, flags.value, extack); else err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, - tp, extack); + tp, flags.value, extack); if (err < 0) goto err_mod; @@ -974,7 +1003,6 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, err = PTR_ERR(act); goto err; } - act->order = i; sz += tcf_action_fill_size(act); /* Start from index 0 */ actions[i - 1] = act; @@ -988,6 +1016,29 @@ err: return err; } +void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets, + bool drop, bool hw) +{ + if (a->cpu_bstats) { + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); + + if (drop) + this_cpu_ptr(a->cpu_qstats)->drops += packets; + + if (hw) + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), + bytes, packets); + return; + } + + _bstats_update(&a->tcfa_bstats, bytes, packets); + if (drop) + a->tcfa_qstats.drops += packets; + if (hw) + _bstats_update(&a->tcfa_bstats_hw, bytes, packets); +} +EXPORT_SYMBOL(tcf_action_update_stats); + int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, int compat_mode) { @@ -1098,7 +1149,8 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, int index; int err; - err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, extack); + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, + tcf_action_policy, extack); if (err < 0) goto err_out; @@ -1152,7 +1204,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, b = skb_tail_pointer(skb); - err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, extack); + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, + tcf_action_policy, extack); if (err < 0) goto err_out; @@ -1350,11 +1403,16 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { size_t attr_size = 0; - int ret = 0; + int loop, ret; struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; - ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, actions, - &attr_size, true, extack); + for (loop = 0; loop < 10; loop++) { + ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, + actions, &attr_size, true, extack); + if (ret != -EAGAIN) + break; + } + if (ret < 0) return ret; ret = tcf_add_notify(net, n, actions, portid, attr_size, extack); @@ -1404,11 +1462,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, */ if (n->nlmsg_flags & NLM_F_REPLACE) ovr = 1; -replay: ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr, extack); - if (ret == -EAGAIN) - goto replay; break; case RTM_DELACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, @@ -1440,7 +1495,7 @@ static struct nlattr *find_dump_kind(struct nlattr **nla) if (tb[1] == NULL) return NULL; - if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], NULL, NULL) < 0) + if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], tcf_action_policy, NULL) < 0) return NULL; kind = tb2[TCA_ACT_KIND]; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index fd1f7e799e23..46f47e58b3be 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -275,7 +275,8 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, int replace, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; @@ -303,7 +304,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, ret = tcf_idr_check_alloc(tn, &index, act, bind); if (!ret) { ret = tcf_idr_create(tn, index, est, act, - &act_bpf_ops, bind, true); + &act_bpf_ops, bind, true, 0); if (ret < 0) { tcf_idr_cleanup(tn, index); return ret; @@ -422,7 +423,7 @@ static __net_init int bpf_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, bpf_net_id); - return tc_action_net_init(tn, &act_bpf_ops); + return tc_action_net_init(net, tn, &act_bpf_ops); } static void __net_exit bpf_exit_net(struct list_head *net_list) diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 32ac04d77a45..43a243081e7d 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -94,7 +94,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); @@ -121,7 +121,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, ret = tcf_idr_check_alloc(tn, &index, a, bind); if (!ret) { ret = tcf_idr_create(tn, index, est, a, - &act_connmark_ops, bind, false); + &act_connmark_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -231,7 +231,7 @@ static __net_init int connmark_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, connmark_net_id); - return tc_action_net_init(tn, &act_connmark_ops); + return tc_action_net_init(net, tn, &act_connmark_ops); } static void __net_exit connmark_exit_net(struct list_head *net_list) diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 9b9288267a54..cb8608f0a77a 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -43,7 +43,7 @@ static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); struct tcf_csum_params *params_new; @@ -68,8 +68,8 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_csum_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_csum_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -101,8 +101,8 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - rcu_swap_protected(p->params, params_new, - lockdep_is_held(&p->tcf_lock)); + params_new = rcu_replace_pointer(p->params, params_new, + lockdep_is_held(&p->tcf_lock)); spin_unlock_bh(&p->tcf_lock); if (goto_ch) @@ -580,7 +580,7 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, params = rcu_dereference_bh(p->params); tcf_lastuse_update(&p->tcf_tm); - bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb); + tcf_action_update_bstats(&p->common, skb); action = READ_ONCE(p->tcf_action); if (unlikely(action == TC_ACT_SHOT)) @@ -624,7 +624,7 @@ out: return action; drop: - qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats)); + tcf_action_inc_drop_qstats(&p->common); action = TC_ACT_SHOT; goto out; } @@ -714,7 +714,7 @@ static __net_init int csum_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, csum_net_id); - return tc_action_net_init(tn, &act_csum_ops); + return tc_action_net_init(net, tn, &act_csum_ops); } static void __net_exit csum_exit_net(struct list_head *net_list) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 33a1a7406e87..f685c0d73708 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -24,12 +24,12 @@ #include <uapi/linux/tc_act/tc_ct.h> #include <net/tc_act/tc_ct.h> -#include <linux/netfilter/nf_nat.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#include <uapi/linux/netfilter/nf_nat.h> static struct tc_action_ops act_ct_ops; static unsigned int ct_net_id; @@ -312,7 +312,7 @@ static void tcf_ct_act_set_labels(struct nf_conn *ct, u32 *labels_m) { #if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) - size_t labels_sz = FIELD_SIZEOF(struct tcf_ct_params, labels); + size_t labels_sz = sizeof_field(struct tcf_ct_params, labels); if (!memchr_inv(labels_m, 0, labels_sz)) return; @@ -329,6 +329,7 @@ static int tcf_ct_act_nat(struct sk_buff *skb, bool commit) { #if IS_ENABLED(CONFIG_NF_NAT) + int err; enum nf_nat_manip_type maniptype; if (!(ct_action & TCA_CT_ACT_NAT)) @@ -359,7 +360,17 @@ static int tcf_ct_act_nat(struct sk_buff *skb, return NF_ACCEPT; } - return ct_nat_execute(skb, ct, ctinfo, range, maniptype); + err = ct_nat_execute(skb, ct, ctinfo, range, maniptype); + if (err == NF_ACCEPT && + ct->status & IPS_SRC_NAT && ct->status & IPS_DST_NAT) { + if (maniptype == NF_NAT_MANIP_SRC) + maniptype = NF_NAT_MANIP_DST; + else + maniptype = NF_NAT_MANIP_SRC; + + err = ct_nat_execute(skb, ct, ctinfo, range, maniptype); + } + return err; #else return NF_ACCEPT; #endif @@ -465,16 +476,15 @@ out_push: skb_push_rcsum(skb, nh_ofs); out: - bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); + tcf_action_update_bstats(&c->common, skb); return retval; drop: - qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); + tcf_action_inc_drop_qstats(&c->common); return TC_ACT_SHOT; } static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = { - [TCA_CT_UNSPEC] = { .strict_start_type = TCA_CT_UNSPEC + 1 }, [TCA_CT_ACTION] = { .type = NLA_U16 }, [TCA_CT_PARMS] = { .type = NLA_EXACT_LEN, .len = sizeof(struct tc_ct) }, [TCA_CT_ZONE] = { .type = NLA_U16 }, @@ -656,7 +666,7 @@ static int tcf_ct_fill_params(struct net *net, static int tcf_ct_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int replace, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ct_net_id); @@ -688,8 +698,8 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, return err; if (!err) { - err = tcf_idr_create(tn, index, est, a, - &act_ct_ops, bind, true); + err = tcf_idr_create_from_flags(tn, index, est, a, + &act_ct_ops, bind, flags); if (err) { tcf_idr_cleanup(tn, index); return err; @@ -722,7 +732,8 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, spin_lock_bh(&c->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - rcu_swap_protected(c->params, params, lockdep_is_held(&c->tcf_lock)); + params = rcu_replace_pointer(c->params, params, + lockdep_is_held(&c->tcf_lock)); spin_unlock_bh(&c->tcf_lock); if (goto_ch) @@ -905,11 +916,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, { struct tcf_ct *c = to_ct(a); - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - - if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + tcf_action_update_stats(a, bytes, packets, false, hw); c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse); } @@ -929,7 +936,7 @@ static struct tc_action_ops act_ct_ops = { static __net_init int ct_init_net(struct net *net) { - unsigned int n_bits = FIELD_SIZEOF(struct tcf_ct_params, labels) * 8; + unsigned int n_bits = sizeof_field(struct tcf_ct_params, labels) * 8; struct tc_ct_action_net *tn = net_generic(net, ct_net_id); if (nf_connlabels_get(net, n_bits - 1)) { @@ -939,7 +946,7 @@ static __net_init int ct_init_net(struct net *net) tn->labels = true; } - return tc_action_net_init(&tn->tn, &act_ct_ops); + return tc_action_net_init(net, &tn->tn, &act_ct_ops); } static void __net_exit ct_exit_net(struct list_head *net_list) diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 06ef74b74911..19649623493b 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -153,7 +153,7 @@ static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ctinfo_net_id); @@ -210,7 +210,7 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create(tn, index, est, a, - &act_ctinfo_ops, bind, false); + &act_ctinfo_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -257,8 +257,8 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, spin_lock_bh(&ci->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch); - rcu_swap_protected(ci->params, cp_new, - lockdep_is_held(&ci->tcf_lock)); + cp_new = rcu_replace_pointer(ci->params, cp_new, + lockdep_is_held(&ci->tcf_lock)); spin_unlock_bh(&ci->tcf_lock); if (goto_ch) @@ -360,6 +360,16 @@ static int tcf_ctinfo_search(struct net *net, struct tc_action **a, u32 index) return tcf_idr_search(tn, a, index); } +static void tcf_ctinfo_cleanup(struct tc_action *a) +{ + struct tcf_ctinfo *ci = to_ctinfo(a); + struct tcf_ctinfo_params *cp; + + cp = rcu_dereference_protected(ci->params, 1); + if (cp) + kfree_rcu(cp, rcu); +} + static struct tc_action_ops act_ctinfo_ops = { .kind = "ctinfo", .id = TCA_ID_CTINFO, @@ -367,6 +377,7 @@ static struct tc_action_ops act_ctinfo_ops = { .act = tcf_ctinfo_act, .dump = tcf_ctinfo_dump, .init = tcf_ctinfo_init, + .cleanup= tcf_ctinfo_cleanup, .walk = tcf_ctinfo_walker, .lookup = tcf_ctinfo_search, .size = sizeof(struct tcf_ctinfo), @@ -376,7 +387,7 @@ static __net_init int ctinfo_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, ctinfo_net_id); - return tc_action_net_init(tn, &act_ctinfo_ops); + return tc_action_net_init(net, tn, &act_ctinfo_ops); } static void __net_exit ctinfo_exit_net(struct list_head *net_list) diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 8f0140c6ca58..416065772719 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -53,7 +53,8 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); struct nlattr *tb[TCA_GACT_MAX + 1]; @@ -98,8 +99,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_gact_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_gact_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -161,9 +162,9 @@ static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a, action = gact_rand[ptype](gact); } #endif - bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb); + tcf_action_update_bstats(&gact->common, skb); if (action == TC_ACT_SHOT) - qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats)); + tcf_action_inc_drop_qstats(&gact->common); tcf_lastuse_update(&gact->tcf_tm); @@ -177,15 +178,7 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, int action = READ_ONCE(gact->tcf_action); struct tcf_t *tm = &gact->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes, - packets); - if (action == TC_ACT_SHOT) - this_cpu_ptr(gact->common.cpu_qstats)->drops += packets; - - if (hw) - _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats_hw), - bytes, packets); - + tcf_action_update_stats(a, bytes, packets, action == TC_ACT_SHOT, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } @@ -278,7 +271,7 @@ static __net_init int gact_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, gact_net_id); - return tc_action_net_init(tn, &act_gact_ops); + return tc_action_net_init(net, tn, &act_gact_ops); } static void __net_exit gact_exit_net(struct list_head *net_list) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 92ee853d43e6..c1fcd85719d6 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -465,7 +465,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; @@ -522,7 +523,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_ife_ops, - bind, true); + bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); kfree(p); @@ -536,6 +537,9 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, } ife = to_ife(*a); + if (ret == ACT_P_CREATED) + INIT_LIST_HEAD(&ife->metalist); + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; @@ -565,10 +569,6 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, p->eth_type = ife_type; } - - if (ret == ACT_P_CREATED) - INIT_LIST_HEAD(&ife->metalist); - if (tb[TCA_IFE_METALST]) { err = nla_parse_nested_deprecated(tb2, IFE_META_MAX, tb[TCA_IFE_METALST], NULL, @@ -594,7 +594,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, spin_lock_bh(&ife->tcf_lock); /* protected by tcf_lock when modifying existing action */ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - rcu_swap_protected(ife->params, p, 1); + p = rcu_replace_pointer(ife->params, p, 1); if (exists) spin_unlock_bh(&ife->tcf_lock); @@ -890,7 +890,7 @@ static __net_init int ife_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, ife_net_id); - return tc_action_net_init(tn, &act_ife_ops); + return tc_action_net_init(net, tn, &act_ife_ops); } static void __net_exit ife_exit_net(struct list_head *net_list) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index ce2c30a591d2..400a2cfe8452 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -61,12 +61,13 @@ static int ipt_init_target(struct net *net, struct xt_entry_target *t, return 0; } -static void ipt_destroy_target(struct xt_entry_target *t) +static void ipt_destroy_target(struct xt_entry_target *t, struct net *net) { struct xt_tgdtor_param par = { .target = t->u.kernel.target, .targinfo = t->data, .family = NFPROTO_IPV4, + .net = net, }; if (par.target->destroy != NULL) par.target->destroy(&par); @@ -78,7 +79,7 @@ static void tcf_ipt_release(struct tc_action *a) struct tcf_ipt *ipt = to_ipt(a); if (ipt->tcfi_t) { - ipt_destroy_target(ipt->tcfi_t); + ipt_destroy_target(ipt->tcfi_t, a->idrinfo->net); kfree(ipt->tcfi_t); } kfree(ipt->tcfi_tname); @@ -94,7 +95,7 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int ovr, int bind, - struct tcf_proto *tp) + struct tcf_proto *tp, u32 flags) { struct tc_action_net *tn = net_generic(net, id); struct nlattr *tb[TCA_IPT_MAX + 1]; @@ -143,7 +144,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, ops, bind, - false); + false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -180,7 +181,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, spin_lock_bh(&ipt->tcf_lock); if (ret != ACT_P_CREATED) { - ipt_destroy_target(ipt->tcfi_t); + ipt_destroy_target(ipt->tcfi_t, net); kfree(ipt->tcfi_tname); kfree(ipt->tcfi_t); } @@ -204,19 +205,19 @@ err1: static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, - bind, tp); + bind, tp, flags); } static int tcf_xt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool unlocked, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, - bind, tp); + bind, tp, flags); } static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a, @@ -350,7 +351,7 @@ static __net_init int ipt_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, ipt_net_id); - return tc_action_net_init(tn, &act_ipt_ops); + return tc_action_net_init(net, tn, &act_ipt_ops); } static void __net_exit ipt_exit_net(struct list_head *net_list) @@ -399,7 +400,7 @@ static __net_init int xt_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, xt_net_id); - return tc_action_net_init(tn, &act_xt_ops); + return tc_action_net_init(net, tn, &act_xt_ops); } static void __net_exit xt_exit_net(struct list_head *net_list) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index be3f88dfc37e..1ad300e6dbc0 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -93,7 +93,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); struct nlattr *tb[TCA_MIRRED_MAX + 1]; @@ -148,8 +148,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist"); return -EINVAL; } - ret = tcf_idr_create(tn, index, est, a, - &act_mirred_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_mirred_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -178,8 +178,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, goto put_chain; } mac_header_xmit = dev_is_mac_header_xmit(dev); - rcu_swap_protected(m->tcfm_dev, dev, - lockdep_is_held(&m->tcf_lock)); + dev = rcu_replace_pointer(m->tcfm_dev, dev, + lockdep_is_held(&m->tcf_lock)); if (dev) dev_put(dev); m->tcfm_mac_header_xmit = mac_header_xmit; @@ -219,8 +219,10 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, bool use_reinsert; bool want_ingress; bool is_redirect; + bool expects_nh; int m_eaction; int mac_len; + bool at_nh; rec_level = __this_cpu_inc_return(mirred_rec_level); if (unlikely(rec_level > MIRRED_RECURSION_LIMIT)) { @@ -231,7 +233,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, } tcf_lastuse_update(&m->tcf_tm); - bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); + tcf_action_update_bstats(&m->common, skb); m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); m_eaction = READ_ONCE(m->tcfm_eaction); @@ -261,19 +263,19 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, goto out; } - /* If action's target direction differs than filter's direction, - * and devices expect a mac header on xmit, then mac push/pull is - * needed. - */ want_ingress = tcf_mirred_act_wants_ingress(m_eaction); - if (skb_at_tc_ingress(skb) != want_ingress && m_mac_header_xmit) { - if (!skb_at_tc_ingress(skb)) { - /* caught at egress, act ingress: pull mac */ - mac_len = skb_network_header(skb) - skb_mac_header(skb); + + expects_nh = want_ingress || !m_mac_header_xmit; + at_nh = skb->data == skb_network_header(skb); + if (at_nh != expects_nh) { + mac_len = skb_at_tc_ingress(skb) ? skb->mac_len : + skb_network_header(skb) - skb_mac_header(skb); + if (expects_nh) { + /* target device/action expect data at nh */ skb_pull_rcsum(skb2, mac_len); } else { - /* caught at ingress, act egress: push mac */ - skb_push_rcsum(skb2, skb->mac_len); + /* target device/action expect data at mac */ + skb_push_rcsum(skb2, mac_len); } } @@ -289,8 +291,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, /* let's the caller reinsert the packet, if possible */ if (use_reinsert) { res->ingress = want_ingress; - res->qstats = this_cpu_ptr(m->common.cpu_qstats); - skb_tc_reinsert(skb, res); + if (skb_tc_reinsert(skb, res)) + tcf_action_inc_overlimit_qstats(&m->common); __this_cpu_dec(mirred_rec_level); return TC_ACT_CONSUMED; } @@ -303,7 +305,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, if (err) { out: - qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats)); + tcf_action_inc_overlimit_qstats(&m->common); if (tcf_mirred_is_act_redirect(m_eaction)) retval = TC_ACT_SHOT; } @@ -318,10 +320,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, struct tcf_mirred *m = to_mirred(a); struct tcf_t *tm = &m->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + tcf_action_update_stats(a, bytes, packets, false, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } @@ -408,25 +407,31 @@ static struct notifier_block mirred_device_notifier = { .notifier_call = mirred_device_event, }; -static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) +static void tcf_mirred_dev_put(void *priv) +{ + struct net_device *dev = priv; + + dev_put(dev); +} + +static struct net_device * +tcf_mirred_get_dev(const struct tc_action *a, + tc_action_priv_destructor *destructor) { struct tcf_mirred *m = to_mirred(a); struct net_device *dev; rcu_read_lock(); dev = rcu_dereference(m->tcfm_dev); - if (dev) + if (dev) { dev_hold(dev); + *destructor = tcf_mirred_dev_put; + } rcu_read_unlock(); return dev; } -static void tcf_mirred_put_dev(struct net_device *dev) -{ - dev_put(dev); -} - static size_t tcf_mirred_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_mirred)); @@ -446,14 +451,13 @@ static struct tc_action_ops act_mirred_ops = { .get_fill_size = tcf_mirred_get_fill_size, .size = sizeof(struct tcf_mirred), .get_dev = tcf_mirred_get_dev, - .put_dev = tcf_mirred_put_dev, }; static __net_init int mirred_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, mirred_net_id); - return tc_action_net_init(tn, &act_mirred_ops); + return tc_action_net_init(net, tn, &act_mirred_ops); } static void __net_exit mirred_exit_net(struct list_head *net_list) @@ -479,7 +483,11 @@ static int __init mirred_init_module(void) return err; pr_info("Mirror/redirect action on\n"); - return tcf_register_action(&act_mirred_ops, &mirred_net_ops); + err = tcf_register_action(&act_mirred_ops, &mirred_net_ops); + if (err) + unregister_netdevice_notifier(&mirred_device_notifier); + + return err; } static void __exit mirred_cleanup_module(void) diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 0f299e3b618c..be3f215cd027 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) /* Copyright (C) 2019 Netronome Systems, Inc. */ +#include <linux/if_arp.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> @@ -55,7 +56,7 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_mpls *m = to_mpls(a); struct tcf_mpls_params *p; __be32 new_lse; - int ret; + int ret, mac_len; tcf_lastuse_update(&m->tcf_tm); bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); @@ -63,8 +64,12 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, /* Ensure 'data' points at mac_header prior calling mpls manipulating * functions. */ - if (skb_at_tc_ingress(skb)) + if (skb_at_tc_ingress(skb)) { skb_push_rcsum(skb, skb->mac_len); + mac_len = skb->mac_len; + } else { + mac_len = skb_network_header(skb) - skb_mac_header(skb); + } ret = READ_ONCE(m->tcf_action); @@ -72,12 +77,14 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, switch (p->tcfm_action) { case TCA_MPLS_ACT_POP: - if (skb_mpls_pop(skb, p->tcfm_proto)) + if (skb_mpls_pop(skb, p->tcfm_proto, mac_len, + skb->dev && skb->dev->type == ARPHRD_ETHER)) goto drop; break; case TCA_MPLS_ACT_PUSH: new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol)); - if (skb_mpls_push(skb, new_lse, p->tcfm_proto)) + if (skb_mpls_push(skb, new_lse, p->tcfm_proto, mac_len, + skb->dev && skb->dev->type == ARPHRD_ETHER)) goto drop; break; case TCA_MPLS_ACT_MODIFY: @@ -115,7 +122,6 @@ static int valid_label(const struct nlattr *attr, } static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = { - [TCA_MPLS_UNSPEC] = { .strict_start_type = TCA_MPLS_UNSPEC + 1 }, [TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)), [TCA_MPLS_PROTO] = { .type = NLA_U16 }, [TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_U32, valid_label), @@ -127,7 +133,8 @@ static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = { static int tcf_mpls_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mpls_net_id); struct nlattr *tb[TCA_MPLS_MAX + 1]; @@ -220,7 +227,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_mpls_ops, bind, true); + &act_mpls_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -258,7 +265,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, spin_lock_bh(&m->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - rcu_swap_protected(m->mpls_p, p, lockdep_is_held(&m->tcf_lock)); + p = rcu_replace_pointer(m->mpls_p, p, lockdep_is_held(&m->tcf_lock)); spin_unlock_bh(&m->tcf_lock); if (goto_ch) @@ -375,7 +382,7 @@ static __net_init int mpls_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, mpls_net_id); - return tc_action_net_init(tn, &act_mpls_ops); + return tc_action_net_init(net, tn, &act_mpls_ops); } static void __net_exit mpls_exit_net(struct list_head *net_list) diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 7b858c11b1b5..855a6fa16a62 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -36,7 +36,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; @@ -61,7 +61,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create(tn, index, est, a, - &act_nat_ops, bind, false); + &act_nat_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -206,9 +206,7 @@ static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a, icmph = (void *)(skb_network_header(skb) + ihl); - if ((icmph->type != ICMP_DEST_UNREACH) && - (icmph->type != ICMP_TIME_EXCEEDED) && - (icmph->type != ICMP_PARAMETERPROB)) + if (!icmp_is_err(icmph->type)) break; if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) + @@ -327,7 +325,7 @@ static __net_init int nat_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, nat_net_id); - return tc_action_net_init(tn, &act_nat_ops); + return tc_action_net_init(net, tn, &act_nat_ops); } static void __net_exit nat_exit_net(struct list_head *net_list) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 17360c6faeaa..3ad718576304 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -43,7 +43,7 @@ static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla, int err = -EINVAL; int rem; - if (!nla || !n) + if (!nla) return NULL; keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL); @@ -137,7 +137,8 @@ nla_failure: static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; @@ -170,6 +171,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } parm = nla_data(pattr); + if (!parm->nkeys) { + NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); + return -EINVAL; + } ksize = parm->nkeys * sizeof(struct tc_pedit_key); if (nla_len(pattr) < sizeof(*parm) + ksize) { NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid"); @@ -183,14 +188,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - if (!parm->nkeys) { - tcf_idr_cleanup(tn, index); - NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); - ret = -EINVAL; - goto out_free; - } ret = tcf_idr_create(tn, index, est, a, - &act_pedit_ops, bind, false); + &act_pedit_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); goto out_free; @@ -498,7 +497,7 @@ static __net_init int pedit_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, pedit_net_id); - return tc_action_net_init(tn, &act_pedit_ops); + return tc_action_net_init(net, tn, &act_pedit_ops); } static void __net_exit pedit_exit_net(struct list_head *net_list) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 49cec3e64a4d..8b7a0ac96c51 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -40,12 +40,14 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { [TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE }, [TCA_POLICE_AVRATE] = { .type = NLA_U32 }, [TCA_POLICE_RESULT] = { .type = NLA_U32 }, + [TCA_POLICE_RATE64] = { .type = NLA_U64 }, + [TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 }, }; static int tcf_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { int ret = 0, tcfp_result = TC_ACT_OK, err, size; @@ -58,6 +60,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, struct tcf_police_params *new; bool exists = false; u32 index; + u64 rate64, prate64; if (nla == NULL) return -EINVAL; @@ -84,7 +87,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, NULL, a, - &act_police_ops, bind, true); + &act_police_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -155,14 +158,18 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } if (R_tab) { new->rate_present = true; - psched_ratecfg_precompute(&new->rate, &R_tab->rate, 0); + rate64 = tb[TCA_POLICE_RATE64] ? + nla_get_u64(tb[TCA_POLICE_RATE64]) : 0; + psched_ratecfg_precompute(&new->rate, &R_tab->rate, rate64); qdisc_put_rtab(R_tab); } else { new->rate_present = false; } if (P_tab) { new->peak_present = true; - psched_ratecfg_precompute(&new->peak, &P_tab->rate, 0); + prate64 = tb[TCA_POLICE_PEAKRATE64] ? + nla_get_u64(tb[TCA_POLICE_PEAKRATE64]) : 0; + psched_ratecfg_precompute(&new->peak, &P_tab->rate, prate64); qdisc_put_rtab(P_tab); } else { new->peak_present = false; @@ -184,9 +191,9 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, police->tcfp_ptoks = new->tcfp_mtu_ptoks; spin_unlock_bh(&police->tcfp_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - rcu_swap_protected(police->params, - new, - lockdep_is_held(&police->tcf_lock)); + new = rcu_replace_pointer(police->params, + new, + lockdep_is_held(&police->tcf_lock)); spin_unlock_bh(&police->tcf_lock); if (goto_ch) @@ -287,10 +294,7 @@ static void tcf_police_stats_update(struct tc_action *a, struct tcf_police *police = to_police(a); struct tcf_t *tm = &police->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + tcf_action_update_stats(a, bytes, packets, false, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } @@ -313,10 +317,22 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, lockdep_is_held(&police->tcf_lock)); opt.mtu = p->tcfp_mtu; opt.burst = PSCHED_NS2TICKS(p->tcfp_burst); - if (p->rate_present) + if (p->rate_present) { psched_ratecfg_getrate(&opt.rate, &p->rate); - if (p->peak_present) + if ((police->params->rate.rate_bytes_ps >= (1ULL << 32)) && + nla_put_u64_64bit(skb, TCA_POLICE_RATE64, + police->params->rate.rate_bytes_ps, + TCA_POLICE_PAD)) + goto nla_put_failure; + } + if (p->peak_present) { psched_ratecfg_getrate(&opt.peakrate, &p->peak); + if ((police->params->peak.rate_bytes_ps >= (1ULL << 32)) && + nla_put_u64_64bit(skb, TCA_POLICE_PEAKRATE64, + police->params->peak.rate_bytes_ps, + TCA_POLICE_PAD)) + goto nla_put_failure; + } if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) goto nla_put_failure; if (p->tcfp_result && @@ -326,10 +342,7 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, nla_put_u32(skb, TCA_POLICE_AVRATE, p->tcfp_ewma_rate)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - police->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(police->tcf_tm.expires); + tcf_tm_dump(&t, &police->tcf_tm); if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) goto nla_put_failure; spin_unlock_bh(&police->tcf_lock); @@ -371,7 +384,7 @@ static __net_init int police_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, police_net_id); - return tc_action_net_init(tn, &act_police_ops); + return tc_action_net_init(net, tn, &act_police_ops); } static void __net_exit police_exit_net(struct list_head *net_list) diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 595308d60133..ce948c1e24dc 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -36,7 +36,7 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { static int tcf_sample_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); struct nlattr *tb[TCA_SAMPLE_MAX + 1]; @@ -69,7 +69,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_sample_ops, bind, true); + &act_sample_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -102,13 +102,17 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); s->rate = rate; s->psample_group_num = psample_group_num; - RCU_INIT_POINTER(s->psample_group, psample_group); + psample_group = rcu_replace_pointer(s->psample_group, psample_group, + lockdep_is_held(&s->tcf_lock)); if (tb[TCA_SAMPLE_TRUNC_SIZE]) { s->truncate = true; s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]); } spin_unlock_bh(&s->tcf_lock); + + if (psample_group) + psample_group_put(psample_group); if (goto_ch) tcf_chain_put_by_act(goto_ch); @@ -142,6 +146,7 @@ static bool tcf_sample_dev_ok_push(struct net_device *dev) case ARPHRD_TUNNEL6: case ARPHRD_SIT: case ARPHRD_IPGRE: + case ARPHRD_IP6GRE: case ARPHRD_VOID: case ARPHRD_NONE: return false; @@ -248,6 +253,32 @@ static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index) return tcf_idr_search(tn, a, index); } +static void tcf_psample_group_put(void *priv) +{ + struct psample_group *group = priv; + + psample_group_put(group); +} + +static struct psample_group * +tcf_sample_get_group(const struct tc_action *a, + tc_action_priv_destructor *destructor) +{ + struct tcf_sample *s = to_sample(a); + struct psample_group *group; + + spin_lock_bh(&s->tcf_lock); + group = rcu_dereference_protected(s->psample_group, + lockdep_is_held(&s->tcf_lock)); + if (group) { + psample_group_take(group); + *destructor = tcf_psample_group_put; + } + spin_unlock_bh(&s->tcf_lock); + + return group; +} + static struct tc_action_ops act_sample_ops = { .kind = "sample", .id = TCA_ID_SAMPLE, @@ -258,6 +289,7 @@ static struct tc_action_ops act_sample_ops = { .cleanup = tcf_sample_cleanup, .walk = tcf_sample_walker, .lookup = tcf_sample_search, + .get_psample_group = tcf_sample_get_group, .size = sizeof(struct tcf_sample), }; @@ -265,7 +297,7 @@ static __net_init int sample_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, sample_net_id); - return tc_action_net_init(tn, &act_sample_ops); + return tc_action_net_init(net, tn, &act_sample_ops); } static void __net_exit sample_exit_net(struct list_head *net_list) diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 33aefa25b545..9813ca4006dd 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -35,7 +35,7 @@ static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, * Example if this was the 3rd packet and the string was "hello" * then it would look like "hello_3" (without quotes) */ - pr_info("simple: %s_%d\n", + pr_info("simple: %s_%llu\n", (char *)d->tcfd_defdata, d->tcf_bstats.packets); spin_unlock(&d->tcf_lock); return d->tcf_action; @@ -86,7 +86,8 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); struct nlattr *tb[TCA_DEF_MAX + 1]; @@ -127,7 +128,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_simp_ops, bind, false); + &act_simp_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -232,7 +233,7 @@ static __net_init int simp_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, simp_net_id); - return tc_action_net_init(tn, &act_simp_ops); + return tc_action_net_init(net, tn, &act_simp_ops); } static void __net_exit simp_exit_net(struct list_head *net_list) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 37dced00b63d..e857424c387c 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -86,7 +86,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); @@ -165,7 +165,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_skbedit_ops, bind, true); + &act_skbedit_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -206,8 +206,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - rcu_swap_protected(d->params, params_new, - lockdep_is_held(&d->tcf_lock)); + params_new = rcu_replace_pointer(d->params, params_new, + lockdep_is_held(&d->tcf_lock)); spin_unlock_bh(&d->tcf_lock); if (params_new) kfree_rcu(params_new, rcu); @@ -336,7 +336,7 @@ static __net_init int skbedit_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); - return tc_action_net_init(tn, &act_skbedit_ops); + return tc_action_net_init(net, tn, &act_skbedit_ops); } static void __net_exit skbedit_exit_net(struct list_head *net_list) diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 7da3518e18ef..39e6d94cfafb 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -79,7 +79,7 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { static int tcf_skbmod_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); @@ -143,7 +143,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_skbmod_ops, bind, true); + &act_skbmod_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -287,7 +287,7 @@ static __net_init int skbmod_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); - return tc_action_net_init(tn, &act_skbmod_ops); + return tc_action_net_init(net, tn, &act_skbmod_ops); } static void __net_exit skbmod_exit_net(struct list_head *net_list) diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 6d0debdc9b97..536c4bc31be6 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -10,6 +10,8 @@ #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <net/geneve.h> +#include <net/vxlan.h> +#include <net/erspan.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/dst.h> @@ -31,7 +33,7 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, params = rcu_dereference_bh(t->params); tcf_lastuse_update(&t->tcf_tm); - bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb); + tcf_action_update_bstats(&t->common, skb); action = READ_ONCE(t->tcf_action); switch (params->tcft_action) { @@ -53,7 +55,11 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, static const struct nla_policy enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = { + [TCA_TUNNEL_KEY_ENC_OPTS_UNSPEC] = { + .strict_start_type = TCA_TUNNEL_KEY_ENC_OPTS_VXLAN }, [TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED }, + [TCA_TUNNEL_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED }, + [TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED }, }; static const struct nla_policy @@ -64,6 +70,19 @@ geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = { .len = 128 }, }; +static const struct nla_policy +vxlan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1] = { + [TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 }, +}; + +static const struct nla_policy +erspan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1] = { + [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 }, + [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, +}; + static int tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len, struct netlink_ext_ack *extack) @@ -116,10 +135,89 @@ tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len, return opt_len; } +static int +tunnel_key_copy_vxlan_opt(const struct nlattr *nla, void *dst, int dst_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1]; + int err; + + err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, nla, + vxlan_opt_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp"); + return -EINVAL; + } + + if (dst) { + struct vxlan_metadata *md = dst; + + md->gbp = nla_get_u32(tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]); + } + + return sizeof(struct vxlan_metadata); +} + +static int +tunnel_key_copy_erspan_opt(const struct nlattr *nla, void *dst, int dst_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1]; + int err; + u8 ver; + + err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX, nla, + erspan_opt_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver"); + return -EINVAL; + } + + ver = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]); + if (ver == 1) { + if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index"); + return -EINVAL; + } + } else if (ver == 2) { + if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] || + !tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid"); + return -EINVAL; + } + } else { + NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect"); + return -EINVAL; + } + + if (dst) { + struct erspan_metadata *md = dst; + + md->version = ver; + if (ver == 1) { + nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]; + md->u.index = nla_get_be32(nla); + } else { + nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR]; + md->u.md2.dir = nla_get_u8(nla); + nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]; + set_hwid(&md->u.md2, nla_get_u8(nla)); + } + } + + return sizeof(struct erspan_metadata); +} + static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, int dst_len, struct netlink_ext_ack *extack) { - int err, rem, opt_len, len = nla_len(nla), opts_len = 0; + int err, rem, opt_len, len = nla_len(nla), opts_len = 0, type = 0; const struct nlattr *attr, *head = nla_data(nla); err = nla_validate_deprecated(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX, @@ -130,15 +228,48 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, nla_for_each_attr(attr, head, len, rem) { switch (nla_type(attr)) { case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: + if (type && type != TUNNEL_GENEVE_OPT) { + NL_SET_ERR_MSG(extack, "Duplicate type for geneve options"); + return -EINVAL; + } opt_len = tunnel_key_copy_geneve_opt(attr, dst, dst_len, extack); if (opt_len < 0) return opt_len; opts_len += opt_len; + if (opts_len > IP_TUNNEL_OPTS_MAX) { + NL_SET_ERR_MSG(extack, "Tunnel options exceeds max size"); + return -EINVAL; + } if (dst) { dst_len -= opt_len; dst += opt_len; } + type = TUNNEL_GENEVE_OPT; + break; + case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: + if (type) { + NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options"); + return -EINVAL; + } + opt_len = tunnel_key_copy_vxlan_opt(attr, dst, + dst_len, extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + type = TUNNEL_VXLAN_OPT; + break; + case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN: + if (type) { + NL_SET_ERR_MSG(extack, "Duplicate type for erspan options"); + return -EINVAL; + } + opt_len = tunnel_key_copy_erspan_opt(attr, dst, + dst_len, extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + type = TUNNEL_ERSPAN_OPT; break; } } @@ -175,6 +306,22 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info, #else return -EAFNOSUPPORT; #endif + case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: +#if IS_ENABLED(CONFIG_INET) + info->key.tun_flags |= TUNNEL_VXLAN_OPT; + return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), + opts_len, extack); +#else + return -EAFNOSUPPORT; +#endif + case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN: +#if IS_ENABLED(CONFIG_INET) + info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), + opts_len, extack); +#else + return -EAFNOSUPPORT; +#endif default: NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type"); return -EINVAL; @@ -208,7 +355,7 @@ static void tunnel_key_release_params(struct tcf_tunnel_key_params *p) static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); @@ -347,8 +494,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, } if (!exists) { - ret = tcf_idr_create(tn, index, est, a, - &act_tunnel_key_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_tunnel_key_ops, bind, + act_flags); if (ret) { NL_SET_ERR_MSG(extack, "Cannot create TC IDR"); goto release_tun_meta; @@ -381,8 +529,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, spin_lock_bh(&t->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - rcu_swap_protected(t->params, params_new, - lockdep_is_held(&t->tcf_lock)); + params_new = rcu_replace_pointer(t->params, params_new, + lockdep_is_held(&t->tcf_lock)); spin_unlock_bh(&t->tcf_lock); tunnel_key_release_params(params_new); if (goto_ch) @@ -450,6 +598,56 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, return 0; } +static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + struct vxlan_metadata *md = (struct vxlan_metadata *)(info + 1); + struct nlattr *start; + + start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_VXLAN); + if (!start) + return -EMSGSIZE; + + if (nla_put_u32(skb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, md->gbp)) { + nla_nest_cancel(skb, start); + return -EMSGSIZE; + } + + nla_nest_end(skb, start); + return 0; +} + +static int tunnel_key_erspan_opts_dump(struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + struct erspan_metadata *md = (struct erspan_metadata *)(info + 1); + struct nlattr *start; + + start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN); + if (!start) + return -EMSGSIZE; + + if (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER, md->version)) + goto err; + + if (md->version == 1 && + nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index)) + goto err; + + if (md->version == 2 && + (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR, + md->u.md2.dir) || + nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID, + get_hwid(&md->u.md2)))) + goto err; + + nla_nest_end(skb, start); + return 0; +err: + nla_nest_cancel(skb, start); + return -EMSGSIZE; +} + static int tunnel_key_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { @@ -467,6 +665,14 @@ static int tunnel_key_opts_dump(struct sk_buff *skb, err = tunnel_key_geneve_opts_dump(skb, info); if (err) goto err_out; + } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { + err = tunnel_key_vxlan_opts_dump(skb, info); + if (err) + goto err_out; + } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) { + err = tunnel_key_erspan_opts_dump(skb, info); + if (err) + goto err_out; } else { err_out: nla_nest_cancel(skb, start); @@ -600,7 +806,7 @@ static __net_init int tunnel_key_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); - return tc_action_net_init(tn, &act_tunnel_key_ops); + return tc_action_net_init(net, tn, &act_tunnel_key_ops); } static void __net_exit tunnel_key_exit_net(struct list_head *net_list) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index a3c9eea1ee8a..c91d3958fcbb 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -29,7 +29,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, u16 tci; tcf_lastuse_update(&v->tcf_tm); - bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb); + tcf_action_update_bstats(&v->common, skb); /* Ensure 'data' points at mac_header prior calling vlan manipulating * functions. @@ -88,7 +88,7 @@ out: return action; drop: - qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats)); + tcf_action_inc_drop_qstats(&v->common); return TC_ACT_SHOT; } @@ -102,7 +102,8 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; @@ -188,8 +189,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, action = parm->v_action; if (!exists) { - ret = tcf_idr_create(tn, index, est, a, - &act_vlan_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_vlan_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -220,7 +221,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, spin_lock_bh(&v->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); + p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); spin_unlock_bh(&v->tcf_lock); if (goto_ch) @@ -301,6 +302,16 @@ static int tcf_vlan_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } +static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u32 packets, + u64 lastuse, bool hw) +{ + struct tcf_vlan *v = to_vlan(a); + struct tcf_t *tm = &v->tcf_tm; + + tcf_action_update_stats(a, bytes, packets, false, hw); + tm->lastuse = max_t(u64, tm->lastuse, lastuse); +} + static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, vlan_net_id); @@ -325,6 +336,7 @@ static struct tc_action_ops act_vlan_ops = { .init = tcf_vlan_init, .cleanup = tcf_vlan_cleanup, .walk = tcf_vlan_walker, + .stats_update = tcf_vlan_stats_update, .get_fill_size = tcf_vlan_get_fill_size, .lookup = tcf_vlan_search, .size = sizeof(struct tcf_vlan), @@ -334,7 +346,7 @@ static __net_init int vlan_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, vlan_net_id); - return tc_action_net_init(tn, &act_vlan_ops); + return tc_action_net_init(net, tn, &act_vlan_ops); } static void __net_exit vlan_exit_net(struct list_head *net_list) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index efd3cfb80a2a..c2cdd0fc2e70 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -21,6 +21,7 @@ #include <linux/slab.h> #include <linux/idr.h> #include <linux/rhashtable.h> +#include <linux/jhash.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> @@ -36,6 +37,8 @@ #include <net/tc_act/tc_sample.h> #include <net/tc_act/tc_skbedit.h> #include <net/tc_act/tc_ct.h> +#include <net/tc_act/tc_mpls.h> +#include <net/flow_offload.h> extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; @@ -45,6 +48,62 @@ static LIST_HEAD(tcf_proto_base); /* Protects list of registered TC modules. It is pure SMP lock. */ static DEFINE_RWLOCK(cls_mod_lock); +static u32 destroy_obj_hashfn(const struct tcf_proto *tp) +{ + return jhash_3words(tp->chain->index, tp->prio, + (__force __u32)tp->protocol, 0); +} + +static void tcf_proto_signal_destroying(struct tcf_chain *chain, + struct tcf_proto *tp) +{ + struct tcf_block *block = chain->block; + + mutex_lock(&block->proto_destroy_lock); + hash_add_rcu(block->proto_destroy_ht, &tp->destroy_ht_node, + destroy_obj_hashfn(tp)); + mutex_unlock(&block->proto_destroy_lock); +} + +static bool tcf_proto_cmp(const struct tcf_proto *tp1, + const struct tcf_proto *tp2) +{ + return tp1->chain->index == tp2->chain->index && + tp1->prio == tp2->prio && + tp1->protocol == tp2->protocol; +} + +static bool tcf_proto_exists_destroying(struct tcf_chain *chain, + struct tcf_proto *tp) +{ + u32 hash = destroy_obj_hashfn(tp); + struct tcf_proto *iter; + bool found = false; + + rcu_read_lock(); + hash_for_each_possible_rcu(chain->block->proto_destroy_ht, iter, + destroy_ht_node, hash) { + if (tcf_proto_cmp(tp, iter)) { + found = true; + break; + } + } + rcu_read_unlock(); + + return found; +} + +static void +tcf_proto_signal_destroyed(struct tcf_chain *chain, struct tcf_proto *tp) +{ + struct tcf_block *block = chain->block; + + mutex_lock(&block->proto_destroy_lock); + if (hash_hashed(&tp->destroy_ht_node)) + hash_del_rcu(&tp->destroy_ht_node); + mutex_unlock(&block->proto_destroy_lock); +} + /* Find classifier type by string name */ static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind) @@ -160,11 +219,22 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) return TC_H_MAJ(first); } +static bool tcf_proto_check_kind(struct nlattr *kind, char *name) +{ + if (kind) + return nla_strlcpy(name, kind, IFNAMSIZ) >= IFNAMSIZ; + memset(name, 0, IFNAMSIZ); + return false; +} + static bool tcf_proto_is_unlocked(const char *kind) { const struct tcf_proto_ops *ops; bool ret; + if (strlen(kind) == 0) + return false; + ops = tcf_proto_lookup_ops(kind, false, NULL); /* On error return false to take rtnl lock. Proto lookup/create * functions will perform lookup again and properly handle errors. @@ -221,9 +291,11 @@ static void tcf_proto_get(struct tcf_proto *tp) static void tcf_chain_put(struct tcf_chain *chain); static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held, - struct netlink_ext_ack *extack) + bool sig_destroy, struct netlink_ext_ack *extack) { tp->ops->destroy(tp, rtnl_held, extack); + if (sig_destroy) + tcf_proto_signal_destroyed(tp->chain, tp); tcf_chain_put(tp->chain); module_put(tp->ops->owner); kfree_rcu(tp, rcu); @@ -233,36 +305,15 @@ static void tcf_proto_put(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { if (refcount_dec_and_test(&tp->refcnt)) - tcf_proto_destroy(tp, rtnl_held, extack); -} - -static int walker_check_empty(struct tcf_proto *tp, void *fh, - struct tcf_walker *arg) -{ - if (fh) { - arg->nonempty = true; - return -1; - } - return 0; + tcf_proto_destroy(tp, rtnl_held, true, extack); } -static bool tcf_proto_is_empty(struct tcf_proto *tp, bool rtnl_held) +static bool tcf_proto_check_delete(struct tcf_proto *tp) { - struct tcf_walker walker = { .fn = walker_check_empty, }; + if (tp->ops->delete_empty) + return tp->ops->delete_empty(tp); - if (tp->ops->walk) { - tp->ops->walk(tp, &walker, rtnl_held); - return !walker.nonempty; - } - return true; -} - -static bool tcf_proto_check_delete(struct tcf_proto *tp, bool rtnl_held) -{ - spin_lock(&tp->lock); - if (tcf_proto_is_empty(tp, rtnl_held)) - tp->deleting = true; - spin_unlock(&tp->lock); + tp->deleting = true; return tp->deleting; } @@ -357,6 +408,7 @@ static bool tcf_chain_detach(struct tcf_chain *chain) static void tcf_block_destroy(struct tcf_block *block) { mutex_destroy(&block->lock); + mutex_destroy(&block->proto_destroy_lock); kfree_rcu(block, rcu); } @@ -532,6 +584,12 @@ static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held) mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_dereference(chain->filter_chain, chain); + while (tp) { + tp_next = rcu_dereference_protected(tp->next, 1); + tcf_proto_signal_destroying(chain, tp); + tp = tp_next; + } + tp = tcf_chain_dereference(chain->filter_chain, chain); RCU_INIT_POINTER(chain->filter_chain, NULL); tcf_chain0_head_change(chain, NULL); chain->flushing = true; @@ -544,235 +602,87 @@ static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held) } } -static struct tcf_block *tc_dev_ingress_block(struct net_device *dev) -{ - const struct Qdisc_class_ops *cops; - struct Qdisc *qdisc; - - if (!dev_ingress_queue(dev)) - return NULL; - - qdisc = dev_ingress_queue(dev)->qdisc_sleeping; - if (!qdisc) - return NULL; - - cops = qdisc->ops->cl_ops; - if (!cops) - return NULL; - - if (!cops->tcf_block) - return NULL; - - return cops->tcf_block(qdisc, TC_H_MIN_INGRESS, NULL); -} - -static struct rhashtable indr_setup_block_ht; - -struct tc_indr_block_dev { - struct rhash_head ht_node; - struct net_device *dev; - unsigned int refcnt; - struct list_head cb_list; - struct tcf_block *block; -}; - -struct tc_indr_block_cb { - struct list_head list; - void *cb_priv; - tc_indr_block_bind_cb_t *cb; - void *cb_ident; -}; - -static const struct rhashtable_params tc_indr_setup_block_ht_params = { - .key_offset = offsetof(struct tc_indr_block_dev, dev), - .head_offset = offsetof(struct tc_indr_block_dev, ht_node), - .key_len = sizeof(struct net_device *), -}; - -static struct tc_indr_block_dev * -tc_indr_block_dev_lookup(struct net_device *dev) -{ - return rhashtable_lookup_fast(&indr_setup_block_ht, &dev, - tc_indr_setup_block_ht_params); -} - -static struct tc_indr_block_dev *tc_indr_block_dev_get(struct net_device *dev) -{ - struct tc_indr_block_dev *indr_dev; - - indr_dev = tc_indr_block_dev_lookup(dev); - if (indr_dev) - goto inc_ref; - - indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL); - if (!indr_dev) - return NULL; - - INIT_LIST_HEAD(&indr_dev->cb_list); - indr_dev->dev = dev; - indr_dev->block = tc_dev_ingress_block(dev); - if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node, - tc_indr_setup_block_ht_params)) { - kfree(indr_dev); - return NULL; - } - -inc_ref: - indr_dev->refcnt++; - return indr_dev; -} - -static void tc_indr_block_dev_put(struct tc_indr_block_dev *indr_dev) -{ - if (--indr_dev->refcnt) - return; - - rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node, - tc_indr_setup_block_ht_params); - kfree(indr_dev); -} - -static struct tc_indr_block_cb * -tc_indr_block_cb_lookup(struct tc_indr_block_dev *indr_dev, - tc_indr_block_bind_cb_t *cb, void *cb_ident) -{ - struct tc_indr_block_cb *indr_block_cb; - - list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) - if (indr_block_cb->cb == cb && - indr_block_cb->cb_ident == cb_ident) - return indr_block_cb; - return NULL; -} - -static struct tc_indr_block_cb * -tc_indr_block_cb_add(struct tc_indr_block_dev *indr_dev, void *cb_priv, - tc_indr_block_bind_cb_t *cb, void *cb_ident) -{ - struct tc_indr_block_cb *indr_block_cb; - - indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident); - if (indr_block_cb) - return ERR_PTR(-EEXIST); - - indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL); - if (!indr_block_cb) - return ERR_PTR(-ENOMEM); - - indr_block_cb->cb_priv = cb_priv; - indr_block_cb->cb = cb; - indr_block_cb->cb_ident = cb_ident; - list_add(&indr_block_cb->list, &indr_dev->cb_list); - - return indr_block_cb; -} - -static void tc_indr_block_cb_del(struct tc_indr_block_cb *indr_block_cb) -{ - list_del(&indr_block_cb->list); - kfree(indr_block_cb); -} - static int tcf_block_setup(struct tcf_block *block, struct flow_block_offload *bo); -static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev, - struct tc_indr_block_cb *indr_block_cb, - enum flow_block_command command) +static void tc_indr_block_cmd(struct net_device *dev, struct tcf_block *block, + flow_indr_block_bind_cb_t *cb, void *cb_priv, + enum flow_block_command command, bool ingress) { struct flow_block_offload bo = { .command = command, - .binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS, - .net = dev_net(indr_dev->dev), - .block_shared = tcf_block_non_null_shared(indr_dev->block), + .binder_type = ingress ? + FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS : + FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS, + .net = dev_net(dev), + .block_shared = tcf_block_non_null_shared(block), }; INIT_LIST_HEAD(&bo.cb_list); - if (!indr_dev->block) + if (!block) return; - bo.block = &indr_dev->block->flow_block; + bo.block = &block->flow_block; + + down_write(&block->cb_lock); + cb(dev, cb_priv, TC_SETUP_BLOCK, &bo); - indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK, - &bo); - tcf_block_setup(indr_dev->block, &bo); + tcf_block_setup(block, &bo); + up_write(&block->cb_lock); } -int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv, - tc_indr_block_bind_cb_t *cb, void *cb_ident) +static struct tcf_block *tc_dev_block(struct net_device *dev, bool ingress) { - struct tc_indr_block_cb *indr_block_cb; - struct tc_indr_block_dev *indr_dev; - int err; + const struct Qdisc_class_ops *cops; + const struct Qdisc_ops *ops; + struct Qdisc *qdisc; - indr_dev = tc_indr_block_dev_get(dev); - if (!indr_dev) - return -ENOMEM; + if (!dev_ingress_queue(dev)) + return NULL; - indr_block_cb = tc_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident); - err = PTR_ERR_OR_ZERO(indr_block_cb); - if (err) - goto err_dev_put; + qdisc = dev_ingress_queue(dev)->qdisc_sleeping; + if (!qdisc) + return NULL; - tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_BIND); - return 0; + ops = qdisc->ops; + if (!ops) + return NULL; -err_dev_put: - tc_indr_block_dev_put(indr_dev); - return err; -} -EXPORT_SYMBOL_GPL(__tc_indr_block_cb_register); + if (!ingress && !strcmp("ingress", ops->id)) + return NULL; -int tc_indr_block_cb_register(struct net_device *dev, void *cb_priv, - tc_indr_block_bind_cb_t *cb, void *cb_ident) -{ - int err; + cops = ops->cl_ops; + if (!cops) + return NULL; - rtnl_lock(); - err = __tc_indr_block_cb_register(dev, cb_priv, cb, cb_ident); - rtnl_unlock(); + if (!cops->tcf_block) + return NULL; - return err; + return cops->tcf_block(qdisc, + ingress ? TC_H_MIN_INGRESS : TC_H_MIN_EGRESS, + NULL); } -EXPORT_SYMBOL_GPL(tc_indr_block_cb_register); -void __tc_indr_block_cb_unregister(struct net_device *dev, - tc_indr_block_bind_cb_t *cb, void *cb_ident) +static void tc_indr_block_get_and_cmd(struct net_device *dev, + flow_indr_block_bind_cb_t *cb, + void *cb_priv, + enum flow_block_command command) { - struct tc_indr_block_cb *indr_block_cb; - struct tc_indr_block_dev *indr_dev; - - indr_dev = tc_indr_block_dev_lookup(dev); - if (!indr_dev) - return; + struct tcf_block *block; - indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident); - if (!indr_block_cb) - return; + block = tc_dev_block(dev, true); + tc_indr_block_cmd(dev, block, cb, cb_priv, command, true); - /* Send unbind message if required to free any block cbs. */ - tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_UNBIND); - tc_indr_block_cb_del(indr_block_cb); - tc_indr_block_dev_put(indr_dev); -} -EXPORT_SYMBOL_GPL(__tc_indr_block_cb_unregister); - -void tc_indr_block_cb_unregister(struct net_device *dev, - tc_indr_block_bind_cb_t *cb, void *cb_ident) -{ - rtnl_lock(); - __tc_indr_block_cb_unregister(dev, cb, cb_ident); - rtnl_unlock(); + block = tc_dev_block(dev, false); + tc_indr_block_cmd(dev, block, cb, cb_priv, command, false); } -EXPORT_SYMBOL_GPL(tc_indr_block_cb_unregister); -static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev, +static void tc_indr_block_call(struct tcf_block *block, + struct net_device *dev, struct tcf_block_ext_info *ei, enum flow_block_command command, struct netlink_ext_ack *extack) { - struct tc_indr_block_cb *indr_block_cb; - struct tc_indr_block_dev *indr_dev; struct flow_block_offload bo = { .command = command, .binder_type = ei->binder_type, @@ -783,22 +693,13 @@ static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev, }; INIT_LIST_HEAD(&bo.cb_list); - indr_dev = tc_indr_block_dev_lookup(dev); - if (!indr_dev) - return; - - indr_dev->block = command == FLOW_BLOCK_BIND ? block : NULL; - - list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) - indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK, - &bo); - + flow_indr_block_call(dev, &bo, command); tcf_block_setup(block, &bo); } static bool tcf_block_offload_in_use(struct tcf_block *block) { - return block->offloadcnt; + return atomic_read(&block->offloadcnt); } static int tcf_block_offload_cmd(struct tcf_block *block, @@ -832,6 +733,7 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, struct net_device *dev = q->dev_queue->dev; int err; + down_write(&block->cb_lock); if (!dev->netdev_ops->ndo_setup_tc) goto no_offload_dev_inc; @@ -840,24 +742,31 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, */ if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) { NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto err_unlock; } err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_BIND, extack); if (err == -EOPNOTSUPP) goto no_offload_dev_inc; if (err) - return err; + goto err_unlock; tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack); + up_write(&block->cb_lock); return 0; no_offload_dev_inc: - if (tcf_block_offload_in_use(block)) - return -EOPNOTSUPP; + if (tcf_block_offload_in_use(block)) { + err = -EOPNOTSUPP; + goto err_unlock; + } + err = 0; block->nooffloaddevcnt++; tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack); - return 0; +err_unlock: + up_write(&block->cb_lock); + return err; } static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, @@ -866,6 +775,7 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, struct net_device *dev = q->dev_queue->dev; int err; + down_write(&block->cb_lock); tc_indr_block_call(block, dev, ei, FLOW_BLOCK_UNBIND, NULL); if (!dev->netdev_ops->ndo_setup_tc) @@ -873,10 +783,12 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_UNBIND, NULL); if (err == -EOPNOTSUPP) goto no_offload_dev_dec; + up_write(&block->cb_lock); return; no_offload_dev_dec: WARN_ON(block->nooffloaddevcnt-- == 0); + up_write(&block->cb_lock); } static int @@ -991,6 +903,8 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, return ERR_PTR(-ENOMEM); } mutex_init(&block->lock); + mutex_init(&block->proto_destroy_lock); + init_rwsem(&block->cb_lock); flow_block_init(&block->flow_block); INIT_LIST_HEAD(&block->chain_list); INIT_LIST_HEAD(&block->owner_list); @@ -1526,6 +1440,8 @@ tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb, struct tcf_proto *tp, *tp_prev; int err; + lockdep_assert_held(&block->cb_lock); + for (chain = __tcf_get_next_chain(block, NULL); chain; chain_prev = chain, @@ -1564,6 +1480,8 @@ static int tcf_block_bind(struct tcf_block *block, struct flow_block_cb *block_cb, *next; int err, i = 0; + lockdep_assert_held(&block->cb_lock); + list_for_each_entry(block_cb, &bo->cb_list, list) { err = tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv, true, @@ -1571,6 +1489,8 @@ static int tcf_block_bind(struct tcf_block *block, bo->extack); if (err) goto err_unroll; + if (!bo->unlocked_driver_cb) + block->lockeddevcnt++; i++; } @@ -1586,6 +1506,8 @@ err_unroll: block_cb->cb_priv, false, tcf_block_offload_in_use(block), NULL); + if (!bo->unlocked_driver_cb) + block->lockeddevcnt--; } flow_block_cb_free(block_cb); } @@ -1598,6 +1520,8 @@ static void tcf_block_unbind(struct tcf_block *block, { struct flow_block_cb *block_cb, *next; + lockdep_assert_held(&block->cb_lock); + list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv, false, @@ -1605,6 +1529,8 @@ static void tcf_block_unbind(struct tcf_block *block, NULL); list_del(&block_cb->list); flow_block_cb_free(block_cb); + if (!bo->unlocked_driver_cb) + block->lockeddevcnt--; } } @@ -1659,6 +1585,18 @@ reclassify: goto reset; } else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) { first_tp = res->goto_tp; + +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + { + struct tc_skb_ext *ext; + + ext = skb_ext_add(skb, TC_SKB_EXT); + if (WARN_ON_ONCE(!ext)) + return TC_ACT_SHOT; + + ext->chain = err & TC_ACT_EXT_VAL_MASK; + } +#endif goto reset; } #endif @@ -1743,6 +1681,12 @@ static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain, mutex_lock(&chain->filter_chain_lock); + if (tcf_proto_exists_destroying(chain, tp_new)) { + mutex_unlock(&chain->filter_chain_lock); + tcf_proto_destroy(tp_new, rtnl_held, false, NULL); + return ERR_PTR(-EAGAIN); + } + tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, false); if (!tp) @@ -1750,10 +1694,10 @@ static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain, mutex_unlock(&chain->filter_chain_lock); if (tp) { - tcf_proto_destroy(tp_new, rtnl_held, NULL); + tcf_proto_destroy(tp_new, rtnl_held, false, NULL); tp_new = tp; } else if (err) { - tcf_proto_destroy(tp_new, rtnl_held, NULL); + tcf_proto_destroy(tp_new, rtnl_held, false, NULL); tp_new = ERR_PTR(err); } @@ -1786,11 +1730,12 @@ static void tcf_chain_tp_delete_empty(struct tcf_chain *chain, * concurrently. * Mark tp for deletion if it is empty. */ - if (!tp_iter || !tcf_proto_check_delete(tp, rtnl_held)) { + if (!tp_iter || !tcf_proto_check_delete(tp)) { mutex_unlock(&chain->filter_chain_lock); return; } + tcf_proto_signal_destroying(chain, tp); next = tcf_chain_dereference(chain_info.next, chain); if (tp == chain->filter_chain) tcf_chain0_head_change(chain, next); @@ -1976,6 +1921,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; + char name[IFNAMSIZ]; struct tcmsg *t; u32 protocol; u32 prio; @@ -2032,13 +1978,19 @@ replay: if (err) return err; + if (tcf_proto_check_kind(tca[TCA_KIND], name)) { + NL_SET_ERR_MSG(extack, "Specified TC filter name too long"); + err = -EINVAL; + goto errout; + } + /* Take rtnl mutex if rtnl_held was set to true on previous iteration, * block is shared (no qdisc found), qdisc is not unlocked, classifier * type is not specified, classifier is not unlocked. */ if (rtnl_held || (q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || - !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) { + !tcf_proto_is_unlocked(name)) { rtnl_held = true; rtnl_lock(); } @@ -2103,9 +2055,8 @@ replay: &chain_info)); mutex_unlock(&chain->filter_chain_lock); - tp_new = tcf_proto_create(nla_data(tca[TCA_KIND]), - protocol, prio, chain, rtnl_held, - extack); + tp_new = tcf_proto_create(name, protocol, prio, chain, + rtnl_held, extack); if (IS_ERR(tp_new)) { err = PTR_ERR(tp_new); goto errout_tp; @@ -2196,6 +2147,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; + char name[IFNAMSIZ]; struct tcmsg *t; u32 protocol; u32 prio; @@ -2235,13 +2187,18 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, if (err) return err; + if (tcf_proto_check_kind(tca[TCA_KIND], name)) { + NL_SET_ERR_MSG(extack, "Specified TC filter name too long"); + err = -EINVAL; + goto errout; + } /* Take rtnl mutex if flushing whole chain, block is shared (no qdisc * found), qdisc is not unlocked, classifier type is not specified, * classifier is not unlocked. */ if (!prio || (q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || - !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) { + !tcf_proto_is_unlocked(name)) { rtnl_held = true; rtnl_lock(); } @@ -2297,6 +2254,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, err = -EINVAL; goto errout_locked; } else if (t->tcm_handle == 0) { + tcf_proto_signal_destroying(chain, tp); tcf_chain_tp_remove(chain, &chain_info, tp); mutex_unlock(&chain->filter_chain_lock); @@ -2349,6 +2307,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; + char name[IFNAMSIZ]; struct tcmsg *t; u32 protocol; u32 prio; @@ -2385,12 +2344,17 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, if (err) return err; + if (tcf_proto_check_kind(tca[TCA_KIND], name)) { + NL_SET_ERR_MSG(extack, "Specified TC filter name too long"); + err = -EINVAL; + goto errout; + } /* Take rtnl mutex if block is shared (no qdisc found), qdisc is not * unlocked, classifier type is not specified, classifier is not * unlocked. */ if ((q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || - !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) { + !tcf_proto_is_unlocked(name)) { rtnl_held = true; rtnl_lock(); } @@ -2749,13 +2713,19 @@ static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net, struct netlink_ext_ack *extack) { const struct tcf_proto_ops *ops; + char name[IFNAMSIZ]; void *tmplt_priv; /* If kind is not set, user did not specify template. */ if (!tca[TCA_KIND]) return 0; - ops = tcf_proto_lookup_ops(nla_data(tca[TCA_KIND]), true, extack); + if (tcf_proto_check_kind(tca[TCA_KIND], name)) { + NL_SET_ERR_MSG(extack, "Specified TC chain template name too long"); + return -EINVAL; + } + + ops = tcf_proto_lookup_ops(name, true, extack); if (IS_ERR(ops)) return PTR_ERR(ops); if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) { @@ -3027,8 +2997,10 @@ out: void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - tcf_action_destroy(exts->actions, TCA_ACT_UNBIND); - kfree(exts->actions); + if (exts->actions) { + tcf_action_destroy(exts->actions, TCA_ACT_UNBIND); + kfree(exts->actions); + } exts->nr_actions = 0; #endif } @@ -3151,17 +3123,61 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) } EXPORT_SYMBOL(tcf_exts_dump_stats); -int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, - void *type_data, bool err_stop) +static void tcf_block_offload_inc(struct tcf_block *block, u32 *flags) +{ + if (*flags & TCA_CLS_FLAGS_IN_HW) + return; + *flags |= TCA_CLS_FLAGS_IN_HW; + atomic_inc(&block->offloadcnt); +} + +static void tcf_block_offload_dec(struct tcf_block *block, u32 *flags) +{ + if (!(*flags & TCA_CLS_FLAGS_IN_HW)) + return; + *flags &= ~TCA_CLS_FLAGS_IN_HW; + atomic_dec(&block->offloadcnt); +} + +static void tc_cls_offload_cnt_update(struct tcf_block *block, + struct tcf_proto *tp, u32 *cnt, + u32 *flags, u32 diff, bool add) +{ + lockdep_assert_held(&block->cb_lock); + + spin_lock(&tp->lock); + if (add) { + if (!*cnt) + tcf_block_offload_inc(block, flags); + *cnt += diff; + } else { + *cnt -= diff; + if (!*cnt) + tcf_block_offload_dec(block, flags); + } + spin_unlock(&tp->lock); +} + +static void +tc_cls_offload_cnt_reset(struct tcf_block *block, struct tcf_proto *tp, + u32 *cnt, u32 *flags) +{ + lockdep_assert_held(&block->cb_lock); + + spin_lock(&tp->lock); + tcf_block_offload_dec(block, flags); + *cnt = 0; + spin_unlock(&tp->lock); +} + +static int +__tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, + void *type_data, bool err_stop) { struct flow_block_cb *block_cb; int ok_count = 0; int err; - /* Make sure all netdevs sharing this block are offload-capable. */ - if (block->nooffloaddevcnt && err_stop) - return -EOPNOTSUPP; - list_for_each_entry(block_cb, &block->flow_block.cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err) { @@ -3173,17 +3189,261 @@ int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, } return ok_count; } + +int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, + void *type_data, bool err_stop, bool rtnl_held) +{ + bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held; + int ok_count; + +retry: + if (take_rtnl) + rtnl_lock(); + down_read(&block->cb_lock); + /* Need to obtain rtnl lock if block is bound to devs that require it. + * In block bind code cb_lock is obtained while holding rtnl, so we must + * obtain the locks in same order here. + */ + if (!rtnl_held && !take_rtnl && block->lockeddevcnt) { + up_read(&block->cb_lock); + take_rtnl = true; + goto retry; + } + + ok_count = __tc_setup_cb_call(block, type, type_data, err_stop); + + up_read(&block->cb_lock); + if (take_rtnl) + rtnl_unlock(); + return ok_count; +} EXPORT_SYMBOL(tc_setup_cb_call); +/* Non-destructive filter add. If filter that wasn't already in hardware is + * successfully offloaded, increment block offloads counter. On failure, + * previously offloaded filter is considered to be intact and offloads counter + * is not decremented. + */ + +int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp, + enum tc_setup_type type, void *type_data, bool err_stop, + u32 *flags, unsigned int *in_hw_count, bool rtnl_held) +{ + bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held; + int ok_count; + +retry: + if (take_rtnl) + rtnl_lock(); + down_read(&block->cb_lock); + /* Need to obtain rtnl lock if block is bound to devs that require it. + * In block bind code cb_lock is obtained while holding rtnl, so we must + * obtain the locks in same order here. + */ + if (!rtnl_held && !take_rtnl && block->lockeddevcnt) { + up_read(&block->cb_lock); + take_rtnl = true; + goto retry; + } + + /* Make sure all netdevs sharing this block are offload-capable. */ + if (block->nooffloaddevcnt && err_stop) { + ok_count = -EOPNOTSUPP; + goto err_unlock; + } + + ok_count = __tc_setup_cb_call(block, type, type_data, err_stop); + if (ok_count < 0) + goto err_unlock; + + if (tp->ops->hw_add) + tp->ops->hw_add(tp, type_data); + if (ok_count > 0) + tc_cls_offload_cnt_update(block, tp, in_hw_count, flags, + ok_count, true); +err_unlock: + up_read(&block->cb_lock); + if (take_rtnl) + rtnl_unlock(); + return ok_count < 0 ? ok_count : 0; +} +EXPORT_SYMBOL(tc_setup_cb_add); + +/* Destructive filter replace. If filter that wasn't already in hardware is + * successfully offloaded, increment block offload counter. On failure, + * previously offloaded filter is considered to be destroyed and offload counter + * is decremented. + */ + +int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp, + enum tc_setup_type type, void *type_data, bool err_stop, + u32 *old_flags, unsigned int *old_in_hw_count, + u32 *new_flags, unsigned int *new_in_hw_count, + bool rtnl_held) +{ + bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held; + int ok_count; + +retry: + if (take_rtnl) + rtnl_lock(); + down_read(&block->cb_lock); + /* Need to obtain rtnl lock if block is bound to devs that require it. + * In block bind code cb_lock is obtained while holding rtnl, so we must + * obtain the locks in same order here. + */ + if (!rtnl_held && !take_rtnl && block->lockeddevcnt) { + up_read(&block->cb_lock); + take_rtnl = true; + goto retry; + } + + /* Make sure all netdevs sharing this block are offload-capable. */ + if (block->nooffloaddevcnt && err_stop) { + ok_count = -EOPNOTSUPP; + goto err_unlock; + } + + tc_cls_offload_cnt_reset(block, tp, old_in_hw_count, old_flags); + if (tp->ops->hw_del) + tp->ops->hw_del(tp, type_data); + + ok_count = __tc_setup_cb_call(block, type, type_data, err_stop); + if (ok_count < 0) + goto err_unlock; + + if (tp->ops->hw_add) + tp->ops->hw_add(tp, type_data); + if (ok_count > 0) + tc_cls_offload_cnt_update(block, tp, new_in_hw_count, + new_flags, ok_count, true); +err_unlock: + up_read(&block->cb_lock); + if (take_rtnl) + rtnl_unlock(); + return ok_count < 0 ? ok_count : 0; +} +EXPORT_SYMBOL(tc_setup_cb_replace); + +/* Destroy filter and decrement block offload counter, if filter was previously + * offloaded. + */ + +int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp, + enum tc_setup_type type, void *type_data, bool err_stop, + u32 *flags, unsigned int *in_hw_count, bool rtnl_held) +{ + bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held; + int ok_count; + +retry: + if (take_rtnl) + rtnl_lock(); + down_read(&block->cb_lock); + /* Need to obtain rtnl lock if block is bound to devs that require it. + * In block bind code cb_lock is obtained while holding rtnl, so we must + * obtain the locks in same order here. + */ + if (!rtnl_held && !take_rtnl && block->lockeddevcnt) { + up_read(&block->cb_lock); + take_rtnl = true; + goto retry; + } + + ok_count = __tc_setup_cb_call(block, type, type_data, err_stop); + + tc_cls_offload_cnt_reset(block, tp, in_hw_count, flags); + if (tp->ops->hw_del) + tp->ops->hw_del(tp, type_data); + + up_read(&block->cb_lock); + if (take_rtnl) + rtnl_unlock(); + return ok_count < 0 ? ok_count : 0; +} +EXPORT_SYMBOL(tc_setup_cb_destroy); + +int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp, + bool add, flow_setup_cb_t *cb, + enum tc_setup_type type, void *type_data, + void *cb_priv, u32 *flags, unsigned int *in_hw_count) +{ + int err = cb(type, type_data, cb_priv); + + if (err) { + if (add && tc_skip_sw(*flags)) + return err; + } else { + tc_cls_offload_cnt_update(block, tp, in_hw_count, flags, 1, + add); + } + + return 0; +} +EXPORT_SYMBOL(tc_setup_cb_reoffload); + +void tc_cleanup_flow_action(struct flow_action *flow_action) +{ + struct flow_action_entry *entry; + int i; + + flow_action_for_each(i, entry, flow_action) + if (entry->destructor) + entry->destructor(entry->destructor_priv); +} +EXPORT_SYMBOL(tc_cleanup_flow_action); + +static void tcf_mirred_get_dev(struct flow_action_entry *entry, + const struct tc_action *act) +{ +#ifdef CONFIG_NET_CLS_ACT + entry->dev = act->ops->get_dev(act, &entry->destructor); + if (!entry->dev) + return; + entry->destructor_priv = entry->dev; +#endif +} + +static void tcf_tunnel_encap_put_tunnel(void *priv) +{ + struct ip_tunnel_info *tunnel = priv; + + kfree(tunnel); +} + +static int tcf_tunnel_encap_get_tunnel(struct flow_action_entry *entry, + const struct tc_action *act) +{ + entry->tunnel = tcf_tunnel_info_copy(act); + if (!entry->tunnel) + return -ENOMEM; + entry->destructor = tcf_tunnel_encap_put_tunnel; + entry->destructor_priv = entry->tunnel; + return 0; +} + +static void tcf_sample_get_group(struct flow_action_entry *entry, + const struct tc_action *act) +{ +#ifdef CONFIG_NET_CLS_ACT + entry->sample.psample_group = + act->ops->get_psample_group(act, &entry->destructor); + entry->destructor_priv = entry->sample.psample_group; +#endif +} + int tc_setup_flow_action(struct flow_action *flow_action, - const struct tcf_exts *exts) + const struct tcf_exts *exts, bool rtnl_held) { const struct tc_action *act; - int i, j, k; + int i, j, k, err = 0; if (!exts) return 0; + if (!rtnl_held) + rtnl_lock(); + j = 0; tcf_exts_for_each_action(i, act, exts) { struct flow_action_entry *entry; @@ -3200,10 +3460,16 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->chain_index = tcf_gact_goto_chain_index(act); } else if (is_tcf_mirred_egress_redirect(act)) { entry->id = FLOW_ACTION_REDIRECT; - entry->dev = tcf_mirred_dev(act); + tcf_mirred_get_dev(entry, act); } else if (is_tcf_mirred_egress_mirror(act)) { entry->id = FLOW_ACTION_MIRRED; - entry->dev = tcf_mirred_dev(act); + tcf_mirred_get_dev(entry, act); + } else if (is_tcf_mirred_ingress_redirect(act)) { + entry->id = FLOW_ACTION_REDIRECT_INGRESS; + tcf_mirred_get_dev(entry, act); + } else if (is_tcf_mirred_ingress_mirror(act)) { + entry->id = FLOW_ACTION_MIRRED_INGRESS; + tcf_mirred_get_dev(entry, act); } else if (is_tcf_vlan(act)) { switch (tcf_vlan_action(act)) { case TCA_VLAN_ACT_PUSH: @@ -3222,11 +3488,14 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->vlan.prio = tcf_vlan_push_prio(act); break; default: + err = -EOPNOTSUPP; goto err_out; } } else if (is_tcf_tunnel_set(act)) { entry->id = FLOW_ACTION_TUNNEL_ENCAP; - entry->tunnel = tcf_tunnel_info(act); + err = tcf_tunnel_encap_get_tunnel(entry, act); + if (err) + goto err_out; } else if (is_tcf_tunnel_release(act)) { entry->id = FLOW_ACTION_TUNNEL_DECAP; } else if (is_tcf_pedit(act)) { @@ -3239,6 +3508,7 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->id = FLOW_ACTION_ADD; break; default: + err = -EOPNOTSUPP; goto err_out; } entry->mangle.htype = tcf_pedit_htype(act, k); @@ -3255,11 +3525,10 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->mark = tcf_skbedit_mark(act); } else if (is_tcf_sample(act)) { entry->id = FLOW_ACTION_SAMPLE; - entry->sample.psample_group = - tcf_sample_psample_group(act); entry->sample.trunc_size = tcf_sample_trunc_size(act); entry->sample.truncate = tcf_sample_truncate(act); entry->sample.rate = tcf_sample_rate(act); + tcf_sample_get_group(entry, act); } else if (is_tcf_police(act)) { entry->id = FLOW_ACTION_POLICE; entry->police.burst = tcf_police_tcfp_burst(act); @@ -3269,16 +3538,50 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->id = FLOW_ACTION_CT; entry->ct.action = tcf_ct_action(act); entry->ct.zone = tcf_ct_zone(act); + } else if (is_tcf_mpls(act)) { + switch (tcf_mpls_action(act)) { + case TCA_MPLS_ACT_PUSH: + entry->id = FLOW_ACTION_MPLS_PUSH; + entry->mpls_push.proto = tcf_mpls_proto(act); + entry->mpls_push.label = tcf_mpls_label(act); + entry->mpls_push.tc = tcf_mpls_tc(act); + entry->mpls_push.bos = tcf_mpls_bos(act); + entry->mpls_push.ttl = tcf_mpls_ttl(act); + break; + case TCA_MPLS_ACT_POP: + entry->id = FLOW_ACTION_MPLS_POP; + entry->mpls_pop.proto = tcf_mpls_proto(act); + break; + case TCA_MPLS_ACT_MODIFY: + entry->id = FLOW_ACTION_MPLS_MANGLE; + entry->mpls_mangle.label = tcf_mpls_label(act); + entry->mpls_mangle.tc = tcf_mpls_tc(act); + entry->mpls_mangle.bos = tcf_mpls_bos(act); + entry->mpls_mangle.ttl = tcf_mpls_ttl(act); + break; + default: + goto err_out; + } + } else if (is_tcf_skbedit_ptype(act)) { + entry->id = FLOW_ACTION_PTYPE; + entry->ptype = tcf_skbedit_ptype(act); } else { + err = -EOPNOTSUPP; goto err_out; } if (!is_tcf_pedit(act)) j++; } - return 0; + err_out: - return -EOPNOTSUPP; + if (!rtnl_held) + rtnl_unlock(); + + if (err) + tc_cleanup_flow_action(flow_action); + + return err; } EXPORT_SYMBOL(tc_setup_flow_action); @@ -3321,6 +3624,11 @@ static struct pernet_operations tcf_net_ops = { .size = sizeof(struct tcf_net), }; +static struct flow_indr_block_entry block_entry = { + .cb = tc_indr_block_get_and_cmd, + .list = LIST_HEAD_INIT(block_entry.list), +}; + static int __init tc_filter_init(void) { int err; @@ -3333,10 +3641,7 @@ static int __init tc_filter_init(void) if (err) goto err_register_pernet_subsys; - err = rhashtable_init(&indr_setup_block_ht, - &tc_indr_setup_block_ht_params); - if (err) - goto err_rhash_setup_block_ht; + flow_indr_add_block_cb(&block_entry); rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, RTNL_FLAG_DOIT_UNLOCKED); @@ -3351,8 +3656,6 @@ static int __init tc_filter_init(void) return 0; -err_rhash_setup_block_ht: - unregister_pernet_subsys(&tcf_net_ops); err_register_pernet_subsys: destroy_workqueue(tc_filter_wq); return err; diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 4aafbe3d435c..f256a7c69093 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -263,12 +263,17 @@ skip: } } -static void basic_bind_class(void *fh, u32 classid, unsigned long cl) +static void basic_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct basic_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh, diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 3f7a9c02b70c..6e3e63db0e01 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -162,18 +162,24 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, cls_bpf.name = obj->bpf_name; cls_bpf.exts_integrated = obj->exts_integrated; - if (oldprog) - tcf_block_offload_dec(block, &oldprog->gen_flags); + if (oldprog && prog) + err = tc_setup_cb_replace(block, tp, TC_SETUP_CLSBPF, &cls_bpf, + skip_sw, &oldprog->gen_flags, + &oldprog->in_hw_count, + &prog->gen_flags, &prog->in_hw_count, + true); + else if (prog) + err = tc_setup_cb_add(block, tp, TC_SETUP_CLSBPF, &cls_bpf, + skip_sw, &prog->gen_flags, + &prog->in_hw_count, true); + else + err = tc_setup_cb_destroy(block, tp, TC_SETUP_CLSBPF, &cls_bpf, + skip_sw, &oldprog->gen_flags, + &oldprog->in_hw_count, true); - err = tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, skip_sw); - if (prog) { - if (err < 0) { - cls_bpf_offload_cmd(tp, oldprog, prog, extack); - return err; - } else if (err > 0) { - prog->in_hw_count = err; - tcf_block_offload_inc(block, &prog->gen_flags); - } + if (prog && err) { + cls_bpf_offload_cmd(tp, oldprog, prog, extack); + return err; } if (prog && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW)) @@ -230,7 +236,7 @@ static void cls_bpf_offload_update_stats(struct tcf_proto *tp, cls_bpf.name = prog->bpf_name; cls_bpf.exts_integrated = prog->exts_integrated; - tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, false); + tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, false, true); } static int cls_bpf_init(struct tcf_proto *tp) @@ -625,12 +631,17 @@ nla_put_failure: return -1; } -static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl) +static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl, + void *q, unsigned long base) { struct cls_bpf_prog *prog = fh; - if (prog && prog->res.classid == classid) - prog->res.class = cl; + if (prog && prog->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &prog->res, base); + else + __tcf_unbind_filter(q, &prog->res); + } } static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg, @@ -673,15 +684,11 @@ static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb cls_bpf.name = prog->bpf_name; cls_bpf.exts_integrated = prog->exts_integrated; - err = cb(TC_SETUP_CLSBPF, &cls_bpf, cb_priv); - if (err) { - if (add && tc_skip_sw(prog->gen_flags)) - return err; - continue; - } - - tc_cls_offload_cnt_update(block, &prog->in_hw_count, - &prog->gen_flags, add); + err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSBPF, + &cls_bpf, cb_priv, &prog->gen_flags, + &prog->in_hw_count); + if (err) + return err; } return 0; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 054123742e32..7e54d2ab5254 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -22,6 +22,8 @@ #include <net/ip.h> #include <net/flow_dissector.h> #include <net/geneve.h> +#include <net/vxlan.h> +#include <net/erspan.h> #include <net/dst.h> #include <net/dst_metadata.h> @@ -54,8 +56,13 @@ struct fl_flow_key { struct flow_dissector_key_ip ip; struct flow_dissector_key_ip enc_ip; struct flow_dissector_key_enc_opts enc_opts; - struct flow_dissector_key_ports tp_min; - struct flow_dissector_key_ports tp_max; + union { + struct flow_dissector_key_ports tp; + struct { + struct flow_dissector_key_ports tp_min; + struct flow_dissector_key_ports tp_max; + }; + } tp_range; struct flow_dissector_key_ct ct; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ @@ -198,19 +205,19 @@ static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter, { __be16 min_mask, max_mask, min_val, max_val; - min_mask = htons(filter->mask->key.tp_min.dst); - max_mask = htons(filter->mask->key.tp_max.dst); - min_val = htons(filter->key.tp_min.dst); - max_val = htons(filter->key.tp_max.dst); + min_mask = htons(filter->mask->key.tp_range.tp_min.dst); + max_mask = htons(filter->mask->key.tp_range.tp_max.dst); + min_val = htons(filter->key.tp_range.tp_min.dst); + max_val = htons(filter->key.tp_range.tp_max.dst); if (min_mask && max_mask) { - if (htons(key->tp.dst) < min_val || - htons(key->tp.dst) > max_val) + if (htons(key->tp_range.tp.dst) < min_val || + htons(key->tp_range.tp.dst) > max_val) return false; /* skb does not have min and max values */ - mkey->tp_min.dst = filter->mkey.tp_min.dst; - mkey->tp_max.dst = filter->mkey.tp_max.dst; + mkey->tp_range.tp_min.dst = filter->mkey.tp_range.tp_min.dst; + mkey->tp_range.tp_max.dst = filter->mkey.tp_range.tp_max.dst; } return true; } @@ -221,19 +228,19 @@ static bool fl_range_port_src_cmp(struct cls_fl_filter *filter, { __be16 min_mask, max_mask, min_val, max_val; - min_mask = htons(filter->mask->key.tp_min.src); - max_mask = htons(filter->mask->key.tp_max.src); - min_val = htons(filter->key.tp_min.src); - max_val = htons(filter->key.tp_max.src); + min_mask = htons(filter->mask->key.tp_range.tp_min.src); + max_mask = htons(filter->mask->key.tp_range.tp_max.src); + min_val = htons(filter->key.tp_range.tp_min.src); + max_val = htons(filter->key.tp_range.tp_max.src); if (min_mask && max_mask) { - if (htons(key->tp.src) < min_val || - htons(key->tp.src) > max_val) + if (htons(key->tp_range.tp.src) < min_val || + htons(key->tp_range.tp.src) > max_val) return false; /* skb does not have min and max values */ - mkey->tp_min.src = filter->mkey.tp_min.src; - mkey->tp_max.src = filter->mkey.tp_max.src; + mkey->tp_range.tp_min.src = filter->mkey.tp_range.tp_min.src; + mkey->tp_range.tp_max.src = filter->mkey.tp_range.tp_max.src; } return true; } @@ -412,41 +419,27 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f, struct tcf_block *block = tp->chain->block; struct flow_cls_offload cls_flower = {}; - if (!rtnl_held) - rtnl_lock(); - tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack); cls_flower.command = FLOW_CLS_DESTROY; cls_flower.cookie = (unsigned long) f; - tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); - spin_lock(&tp->lock); - list_del_init(&f->hw_list); - tcf_block_offload_dec(block, &f->flags); - spin_unlock(&tp->lock); + tc_setup_cb_destroy(block, tp, TC_SETUP_CLSFLOWER, &cls_flower, false, + &f->flags, &f->in_hw_count, rtnl_held); - if (!rtnl_held) - rtnl_unlock(); } static int fl_hw_replace_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool rtnl_held, struct netlink_ext_ack *extack) { - struct cls_fl_head *head = fl_head_dereference(tp); struct tcf_block *block = tp->chain->block; struct flow_cls_offload cls_flower = {}; bool skip_sw = tc_skip_sw(f->flags); int err = 0; - if (!rtnl_held) - rtnl_lock(); - cls_flower.rule = flow_rule_alloc(tcf_exts_num_actions(&f->exts)); - if (!cls_flower.rule) { - err = -ENOMEM; - goto errout; - } + if (!cls_flower.rule) + return -ENOMEM; tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack); cls_flower.command = FLOW_CLS_REPLACE; @@ -456,43 +449,31 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, cls_flower.rule->match.key = &f->mkey; cls_flower.classid = f->res.classid; - err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts); + err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts, + rtnl_held); if (err) { kfree(cls_flower.rule); - if (skip_sw) + if (skip_sw) { NL_SET_ERR_MSG_MOD(extack, "Failed to setup flow action"); - else - err = 0; - goto errout; + return err; + } + return 0; } - err = tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw); + err = tc_setup_cb_add(block, tp, TC_SETUP_CLSFLOWER, &cls_flower, + skip_sw, &f->flags, &f->in_hw_count, rtnl_held); + tc_cleanup_flow_action(&cls_flower.rule->action); kfree(cls_flower.rule); - if (err < 0) { - fl_hw_destroy_filter(tp, f, true, NULL); - goto errout; - } else if (err > 0) { - f->in_hw_count = err; - err = 0; - spin_lock(&tp->lock); - tcf_block_offload_inc(block, &f->flags); - spin_unlock(&tp->lock); - } - - if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW)) { - err = -EINVAL; - goto errout; + if (err) { + fl_hw_destroy_filter(tp, f, rtnl_held, NULL); + return err; } - spin_lock(&tp->lock); - list_add(&f->hw_list, &head->hw_filters); - spin_unlock(&tp->lock); -errout: - if (!rtnl_held) - rtnl_unlock(); + if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW)) + return -EINVAL; - return err; + return 0; } static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f, @@ -501,22 +482,17 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f, struct tcf_block *block = tp->chain->block; struct flow_cls_offload cls_flower = {}; - if (!rtnl_held) - rtnl_lock(); - tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL); cls_flower.command = FLOW_CLS_STATS; cls_flower.cookie = (unsigned long) f; cls_flower.classid = f->res.classid; - tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); + tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false, + rtnl_held); tcf_exts_stats_update(&f->exts, cls_flower.stats.bytes, cls_flower.stats.pkts, cls_flower.stats.lastused); - - if (!rtnl_held) - rtnl_unlock(); } static void __fl_put(struct cls_fl_filter *f) @@ -715,11 +691,16 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { .len = 128 / BITS_PER_BYTE }, [TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_BINARY, .len = 128 / BITS_PER_BYTE }, + [TCA_FLOWER_FLAGS] = { .type = NLA_U32 }, }; static const struct nla_policy enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = { + [TCA_FLOWER_KEY_ENC_OPTS_UNSPEC] = { + .strict_start_type = TCA_FLOWER_KEY_ENC_OPTS_VXLAN }, [TCA_FLOWER_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED }, + [TCA_FLOWER_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED }, + [TCA_FLOWER_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED }, }; static const struct nla_policy @@ -730,6 +711,19 @@ geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = { .len = 128 }, }; +static const struct nla_policy +vxlan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1] = { + [TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 }, +}; + +static const struct nla_policy +erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = { + [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, +}; + static void fl_set_key_val(struct nlattr **tb, void *val, int val_type, void *mask, int mask_type, int len) @@ -746,23 +740,25 @@ static void fl_set_key_val(struct nlattr **tb, static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask) { - fl_set_key_val(tb, &key->tp_min.dst, - TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_min.dst, - TCA_FLOWER_UNSPEC, sizeof(key->tp_min.dst)); - fl_set_key_val(tb, &key->tp_max.dst, - TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_max.dst, - TCA_FLOWER_UNSPEC, sizeof(key->tp_max.dst)); - fl_set_key_val(tb, &key->tp_min.src, - TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_min.src, - TCA_FLOWER_UNSPEC, sizeof(key->tp_min.src)); - fl_set_key_val(tb, &key->tp_max.src, - TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_max.src, - TCA_FLOWER_UNSPEC, sizeof(key->tp_max.src)); - - if ((mask->tp_min.dst && mask->tp_max.dst && - htons(key->tp_max.dst) <= htons(key->tp_min.dst)) || - (mask->tp_min.src && mask->tp_max.src && - htons(key->tp_max.src) <= htons(key->tp_min.src))) + fl_set_key_val(tb, &key->tp_range.tp_min.dst, + TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_range.tp_min.dst, + TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_min.dst)); + fl_set_key_val(tb, &key->tp_range.tp_max.dst, + TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_range.tp_max.dst, + TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.dst)); + fl_set_key_val(tb, &key->tp_range.tp_min.src, + TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_range.tp_min.src, + TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_min.src)); + fl_set_key_val(tb, &key->tp_range.tp_max.src, + TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_range.tp_max.src, + TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.src)); + + if ((mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst && + htons(key->tp_range.tp_max.dst) <= + htons(key->tp_range.tp_min.dst)) || + (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src && + htons(key->tp_range.tp_max.src) <= + htons(key->tp_range.tp_min.src))) return -EINVAL; return 0; @@ -959,6 +955,105 @@ static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key, return sizeof(struct geneve_opt) + data_len; } +static int fl_set_vxlan_opt(const struct nlattr *nla, struct fl_flow_key *key, + int depth, int option_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1]; + struct vxlan_metadata *md; + int err; + + md = (struct vxlan_metadata *)&key->enc_opts.data[key->enc_opts.len]; + memset(md, 0xff, sizeof(*md)); + + if (!depth) + return sizeof(*md); + + if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_VXLAN) { + NL_SET_ERR_MSG(extack, "Non-vxlan option type for mask"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX, nla, + vxlan_opt_policy, extack); + if (err < 0) + return err; + + if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp"); + return -EINVAL; + } + + if (tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]) + md->gbp = nla_get_u32(tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]); + + return sizeof(*md); +} + +static int fl_set_erspan_opt(const struct nlattr *nla, struct fl_flow_key *key, + int depth, int option_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1]; + struct erspan_metadata *md; + int err; + + md = (struct erspan_metadata *)&key->enc_opts.data[key->enc_opts.len]; + memset(md, 0xff, sizeof(*md)); + md->version = 1; + + if (!depth) + return sizeof(*md); + + if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_ERSPAN) { + NL_SET_ERR_MSG(extack, "Non-erspan option type for mask"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX, nla, + erspan_opt_policy, extack); + if (err < 0) + return err; + + if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver"); + return -EINVAL; + } + + if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]) + md->version = nla_get_u8(tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]); + + if (md->version == 1) { + if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index"); + return -EINVAL; + } + if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) { + nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]; + md->u.index = nla_get_be32(nla); + } + } else if (md->version == 2) { + if (!option_len && (!tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR] || + !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID])) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid"); + return -EINVAL; + } + if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR]) { + nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR]; + md->u.md2.dir = nla_get_u8(nla); + } + if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID]) { + nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID]; + set_hwid(&md->u.md2, nla_get_u8(nla)); + } + } else { + NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect"); + return -EINVAL; + } + + return sizeof(*md); +} + static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask, struct netlink_ext_ack *extack) @@ -989,6 +1084,11 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS]), key_depth) { switch (nla_type(nla_opt_key)) { case TCA_FLOWER_KEY_ENC_OPTS_GENEVE: + if (key->enc_opts.dst_opt_type && + key->enc_opts.dst_opt_type != TUNNEL_GENEVE_OPT) { + NL_SET_ERR_MSG(extack, "Duplicate type for geneve options"); + return -EINVAL; + } option_len = 0; key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT; option_len = fl_set_geneve_opt(nla_opt_key, key, @@ -1017,6 +1117,72 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, if (msk_depth) nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; + case TCA_FLOWER_KEY_ENC_OPTS_VXLAN: + if (key->enc_opts.dst_opt_type) { + NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options"); + return -EINVAL; + } + option_len = 0; + key->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT; + option_len = fl_set_vxlan_opt(nla_opt_key, key, + key_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + key->enc_opts.len += option_len; + /* At the same time we need to parse through the mask + * in order to verify exact and mask attribute lengths. + */ + mask->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT; + option_len = fl_set_vxlan_opt(nla_opt_msk, mask, + msk_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + mask->enc_opts.len += option_len; + if (key->enc_opts.len != mask->enc_opts.len) { + NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); + return -EINVAL; + } + + if (msk_depth) + nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); + break; + case TCA_FLOWER_KEY_ENC_OPTS_ERSPAN: + if (key->enc_opts.dst_opt_type) { + NL_SET_ERR_MSG(extack, "Duplicate type for erspan options"); + return -EINVAL; + } + option_len = 0; + key->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT; + option_len = fl_set_erspan_opt(nla_opt_key, key, + key_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + key->enc_opts.len += option_len; + /* At the same time we need to parse through the mask + * in order to verify exact and mask attribute lengths. + */ + mask->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT; + option_len = fl_set_erspan_opt(nla_opt_msk, mask, + msk_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + mask->enc_opts.len += option_len; + if (key->enc_opts.len != mask->enc_opts.len) { + NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); + return -EINVAL; + } + + if (msk_depth) + nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); + break; default: NL_SET_ERR_MSG(extack, "Unknown tunnel option type"); return -EINVAL; @@ -1316,7 +1482,7 @@ static int fl_init_mask_hashtable(struct fl_flow_mask *mask) } #define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member) -#define FL_KEY_MEMBER_SIZE(member) FIELD_SIZEOF(struct fl_flow_key, member) +#define FL_KEY_MEMBER_SIZE(member) sizeof_field(struct fl_flow_key, member) #define FL_KEY_IS_MASKED(mask, member) \ memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \ @@ -1351,9 +1517,10 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); - if (FL_KEY_IS_MASKED(mask, tp) || - FL_KEY_IS_MASKED(mask, tp_min) || FL_KEY_IS_MASKED(mask, tp_max)) - FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_PORTS, tp); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_PORTS_RANGE, tp_range); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IP, ip); FL_KEY_SET_IF_MASKED(mask, keys, cnt, @@ -1402,8 +1569,10 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head, fl_mask_copy(newmask, mask); - if ((newmask->key.tp_min.dst && newmask->key.tp_max.dst) || - (newmask->key.tp_min.src && newmask->key.tp_max.src)) + if ((newmask->key.tp_range.tp_min.dst && + newmask->key.tp_range.tp_max.dst) || + (newmask->key.tp_range.tp_min.src && + newmask->key.tp_range.tp_max.src)) newmask->flags |= TCA_FLOWER_MASK_FLAGS_RANGE; err = fl_init_mask_hashtable(newmask); @@ -1831,7 +2000,8 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, cls_flower.rule->match.mask = &f->mask->key; cls_flower.rule->match.key = &f->mkey; - err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts); + err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts, + true); if (err) { kfree(cls_flower.rule); if (tc_skip_sw(f->flags)) { @@ -1844,21 +2014,17 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, cls_flower.classid = f->res.classid; - err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv); + err = tc_setup_cb_reoffload(block, tp, add, cb, + TC_SETUP_CLSFLOWER, &cls_flower, + cb_priv, &f->flags, + &f->in_hw_count); + tc_cleanup_flow_action(&cls_flower.rule->action); kfree(cls_flower.rule); if (err) { - if (add && tc_skip_sw(f->flags)) { - __fl_put(f); - return err; - } - goto next_flow; + __fl_put(f); + return err; } - - spin_lock(&tp->lock); - tc_cls_offload_cnt_update(block, &f->in_hw_count, &f->flags, - add); - spin_unlock(&tp->lock); next_flow: __fl_put(f); } @@ -1866,6 +2032,30 @@ next_flow: return 0; } +static void fl_hw_add(struct tcf_proto *tp, void *type_data) +{ + struct flow_cls_offload *cls_flower = type_data; + struct cls_fl_filter *f = + (struct cls_fl_filter *) cls_flower->cookie; + struct cls_fl_head *head = fl_head_dereference(tp); + + spin_lock(&tp->lock); + list_add(&f->hw_list, &head->hw_filters); + spin_unlock(&tp->lock); +} + +static void fl_hw_del(struct tcf_proto *tp, void *type_data) +{ + struct flow_cls_offload *cls_flower = type_data; + struct cls_fl_filter *f = + (struct cls_fl_filter *) cls_flower->cookie; + + spin_lock(&tp->lock); + if (!list_empty(&f->hw_list)) + list_del_init(&f->hw_list); + spin_unlock(&tp->lock); +} + static int fl_hw_create_tmplt(struct tcf_chain *chain, struct fl_flow_tmplt *tmplt) { @@ -1886,7 +2076,7 @@ static int fl_hw_create_tmplt(struct tcf_chain *chain, /* We don't care if driver (any of them) fails to handle this * call. It serves just as a hint for it. */ - tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); + tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false, true); kfree(cls_flower.rule); return 0; @@ -1902,7 +2092,7 @@ static void fl_hw_destroy_tmplt(struct tcf_chain *chain, cls_flower.command = FLOW_CLS_TMPLT_DESTROY; cls_flower.cookie = (unsigned long) tmplt; - tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); + tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false, true); } static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, @@ -1980,18 +2170,22 @@ static int fl_dump_key_val(struct sk_buff *skb, static int fl_dump_key_port_range(struct sk_buff *skb, struct fl_flow_key *key, struct fl_flow_key *mask) { - if (fl_dump_key_val(skb, &key->tp_min.dst, TCA_FLOWER_KEY_PORT_DST_MIN, - &mask->tp_min.dst, TCA_FLOWER_UNSPEC, - sizeof(key->tp_min.dst)) || - fl_dump_key_val(skb, &key->tp_max.dst, TCA_FLOWER_KEY_PORT_DST_MAX, - &mask->tp_max.dst, TCA_FLOWER_UNSPEC, - sizeof(key->tp_max.dst)) || - fl_dump_key_val(skb, &key->tp_min.src, TCA_FLOWER_KEY_PORT_SRC_MIN, - &mask->tp_min.src, TCA_FLOWER_UNSPEC, - sizeof(key->tp_min.src)) || - fl_dump_key_val(skb, &key->tp_max.src, TCA_FLOWER_KEY_PORT_SRC_MAX, - &mask->tp_max.src, TCA_FLOWER_UNSPEC, - sizeof(key->tp_max.src))) + if (fl_dump_key_val(skb, &key->tp_range.tp_min.dst, + TCA_FLOWER_KEY_PORT_DST_MIN, + &mask->tp_range.tp_min.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp_range.tp_min.dst)) || + fl_dump_key_val(skb, &key->tp_range.tp_max.dst, + TCA_FLOWER_KEY_PORT_DST_MAX, + &mask->tp_range.tp_max.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp_range.tp_max.dst)) || + fl_dump_key_val(skb, &key->tp_range.tp_min.src, + TCA_FLOWER_KEY_PORT_SRC_MIN, + &mask->tp_range.tp_min.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp_range.tp_min.src)) || + fl_dump_key_val(skb, &key->tp_range.tp_max.src, + TCA_FLOWER_KEY_PORT_SRC_MAX, + &mask->tp_range.tp_max.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp_range.tp_max.src))) return -1; return 0; @@ -2145,6 +2339,61 @@ nla_put_failure: return -EMSGSIZE; } +static int fl_dump_key_vxlan_opt(struct sk_buff *skb, + struct flow_dissector_key_enc_opts *enc_opts) +{ + struct vxlan_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_VXLAN); + if (!nest) + goto nla_put_failure; + + md = (struct vxlan_metadata *)&enc_opts->data[0]; + if (nla_put_u32(skb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, md->gbp)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int fl_dump_key_erspan_opt(struct sk_buff *skb, + struct flow_dissector_key_enc_opts *enc_opts) +{ + struct erspan_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_ERSPAN); + if (!nest) + goto nla_put_failure; + + md = (struct erspan_metadata *)&enc_opts->data[0]; + if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER, md->version)) + goto nla_put_failure; + + if (md->version == 1 && + nla_put_be32(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index)) + goto nla_put_failure; + + if (md->version == 2 && + (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR, + md->u.md2.dir) || + nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID, + get_hwid(&md->u.md2)))) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int fl_dump_key_ct(struct sk_buff *skb, struct flow_dissector_key_ct *key, struct flow_dissector_key_ct *mask) @@ -2198,6 +2447,16 @@ static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type, if (err) goto nla_put_failure; break; + case TUNNEL_VXLAN_OPT: + err = fl_dump_key_vxlan_opt(skb, enc_opts); + if (err) + goto nla_put_failure; + break; + case TUNNEL_ERSPAN_OPT: + err = fl_dump_key_erspan_opt(skb, enc_opts); + if (err) + goto nla_put_failure; + break; default: goto nla_put_failure; } @@ -2507,12 +2766,28 @@ nla_put_failure: return -EMSGSIZE; } -static void fl_bind_class(void *fh, u32 classid, unsigned long cl) +static void fl_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct cls_fl_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } +} + +static bool fl_delete_empty(struct tcf_proto *tp) +{ + struct cls_fl_head *head = fl_head_dereference(tp); + + spin_lock(&tp->lock); + tp->deleting = idr_is_empty(&head->handle_idr); + spin_unlock(&tp->lock); + + return tp->deleting; } static struct tcf_proto_ops cls_fl_ops __read_mostly = { @@ -2524,8 +2799,11 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .put = fl_put, .change = fl_change, .delete = fl_delete, + .delete_empty = fl_delete_empty, .walk = fl_walk, .reoffload = fl_reoffload, + .hw_add = fl_hw_add, + .hw_del = fl_hw_del, .dump = fl_dump, .bind_class = fl_bind_class, .tmplt_create = fl_tmplt_create, diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index c9496c920d6f..ec945294626a 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -419,12 +419,17 @@ nla_put_failure: return -1; } -static void fw_bind_class(void *fh, u32 classid, unsigned long cl) +static void fw_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct fw_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static struct tcf_proto_ops cls_fw_ops __read_mostly = { diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 455ea2793f9b..610a0b728161 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -75,8 +75,8 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp, cls_mall.command = TC_CLSMATCHALL_DESTROY; cls_mall.cookie = cookie; - tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false); - tcf_block_offload_dec(block, &head->flags); + tc_setup_cb_destroy(block, tp, TC_SETUP_CLSMATCHALL, &cls_mall, false, + &head->flags, &head->in_hw_count, true); } static int mall_replace_hw_filter(struct tcf_proto *tp, @@ -97,7 +97,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, cls_mall.command = TC_CLSMATCHALL_REPLACE; cls_mall.cookie = cookie; - err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts); + err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts, true); if (err) { kfree(cls_mall.rule); mall_destroy_hw_filter(tp, head, cookie, NULL); @@ -109,15 +109,14 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, return err; } - err = tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, skip_sw); + err = tc_setup_cb_add(block, tp, TC_SETUP_CLSMATCHALL, &cls_mall, + skip_sw, &head->flags, &head->in_hw_count, true); + tc_cleanup_flow_action(&cls_mall.rule->action); kfree(cls_mall.rule); - if (err < 0) { + if (err) { mall_destroy_hw_filter(tp, head, cookie, NULL); return err; - } else if (err > 0) { - head->in_hw_count = err; - tcf_block_offload_inc(block, &head->flags); } if (skip_sw && !(head->flags & TCA_CLS_FLAGS_IN_HW)) @@ -158,6 +157,7 @@ static void *mall_get(struct tcf_proto *tp, u32 handle) static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = { [TCA_MATCHALL_UNSPEC] = { .type = NLA_UNSPEC }, [TCA_MATCHALL_CLASSID] = { .type = NLA_U32 }, + [TCA_MATCHALL_FLAGS] = { .type = NLA_U32 }, }; static int mall_set_parms(struct net *net, struct tcf_proto *tp, @@ -302,7 +302,7 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY; cls_mall.cookie = (unsigned long)head; - err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts); + err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts, true); if (err) { kfree(cls_mall.rule); if (add && tc_skip_sw(head->flags)) { @@ -312,16 +312,14 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, return 0; } - err = cb(TC_SETUP_CLSMATCHALL, &cls_mall, cb_priv); + err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSMATCHALL, + &cls_mall, cb_priv, &head->flags, + &head->in_hw_count); + tc_cleanup_flow_action(&cls_mall.rule->action); kfree(cls_mall.rule); - if (err) { - if (add && tc_skip_sw(head->flags)) - return err; - return 0; - } - - tc_cls_offload_cnt_update(block, &head->in_hw_count, &head->flags, add); + if (err) + return err; return 0; } @@ -337,7 +335,7 @@ static void mall_stats_hw_filter(struct tcf_proto *tp, cls_mall.command = TC_CLSMATCHALL_STATS; cls_mall.cookie = cookie; - tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false); + tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false, true); tcf_exts_stats_update(&head->exts, cls_mall.stats.bytes, cls_mall.stats.pkts, cls_mall.stats.lastused); @@ -396,12 +394,17 @@ nla_put_failure: return -1; } -static void mall_bind_class(void *fh, u32 classid, unsigned long cl) +static void mall_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct cls_mall_head *head = fh; - if (head && head->res.classid == classid) - head->res.class = cl; + if (head && head->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &head->res, base); + else + __tcf_unbind_filter(q, &head->res); + } } static struct tcf_proto_ops cls_mall_ops __read_mostly = { diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 2d9e0b4484ea..6f8786b06bde 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -641,12 +641,17 @@ nla_put_failure: return -1; } -static void route4_bind_class(void *fh, u32 classid, unsigned long cl) +static void route4_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct route4_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static struct tcf_proto_ops cls_route4_ops __read_mostly = { diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 2f3c03b25d5d..d36949d9382c 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -463,10 +463,8 @@ static u32 gen_tunnel(struct rsvp_head *data) static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = { [TCA_RSVP_CLASSID] = { .type = NLA_U32 }, - [TCA_RSVP_DST] = { .type = NLA_BINARY, - .len = RSVP_DST_LEN * sizeof(u32) }, - [TCA_RSVP_SRC] = { .type = NLA_BINARY, - .len = RSVP_DST_LEN * sizeof(u32) }, + [TCA_RSVP_DST] = { .len = RSVP_DST_LEN * sizeof(u32) }, + [TCA_RSVP_SRC] = { .len = RSVP_DST_LEN * sizeof(u32) }, [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) }, }; @@ -738,12 +736,17 @@ nla_put_failure: return -1; } -static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl) +static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct rsvp_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static struct tcf_proto_ops RSVP_OPS __read_mostly = { diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index e573e5a5c794..09b7dc5fe7e0 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -333,12 +333,31 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, cp->fall_through = p->fall_through; cp->tp = tp; + if (tb[TCA_TCINDEX_HASH]) + cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); + + if (tb[TCA_TCINDEX_MASK]) + cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); + + if (tb[TCA_TCINDEX_SHIFT]) + cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); + + if (!cp->hash) { + /* Hash not specified, use perfect hash if the upper limit + * of the hashing index is below the threshold. + */ + if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD) + cp->hash = (cp->mask >> cp->shift) + 1; + else + cp->hash = DEFAULT_HASH_SIZE; + } + if (p->perfect) { int i; if (tcindex_alloc_perfect_hash(net, cp) < 0) goto errout; - for (i = 0; i < cp->hash; i++) + for (i = 0; i < min(cp->hash, p->hash); i++) cp->perfect[i].res = p->perfect[i].res; balloc = 1; } @@ -346,19 +365,10 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, err = tcindex_filter_result_init(&new_filter_result, net); if (err < 0) - goto errout1; + goto errout_alloc; if (old_r) cr = r->res; - if (tb[TCA_TCINDEX_HASH]) - cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); - - if (tb[TCA_TCINDEX_MASK]) - cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); - - if (tb[TCA_TCINDEX_SHIFT]) - cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); - err = -EBUSY; /* Hash already allocated, make sure that we still meet the @@ -376,16 +386,6 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (tb[TCA_TCINDEX_FALL_THROUGH]) cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); - if (!cp->hash) { - /* Hash not specified, use perfect hash if the upper limit - * of the hashing index is below the threshold. - */ - if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD) - cp->hash = (cp->mask >> cp->shift) + 1; - else - cp->hash = DEFAULT_HASH_SIZE; - } - if (!cp->perfect && !cp->h) cp->alloc_hash = cp->hash; @@ -484,7 +484,6 @@ errout_alloc: tcindex_free_perfect_hash(cp); else if (balloc == 2) kfree(cp->h); -errout1: tcf_exts_destroy(&new_filter_result.exts); errout: kfree(cp); @@ -654,12 +653,17 @@ nla_put_failure: return -1; } -static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl) +static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl, + void *q, unsigned long base) { struct tcindex_filter_result *r = fh; - if (r && r->res.classid == classid) - r->res.class = cl; + if (r && r->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &r->res, base); + else + __tcf_unbind_filter(q, &r->res); + } } static struct tcf_proto_ops cls_tcindex_ops __read_mostly = { diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 8614088edd1b..e15ff335953d 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -480,7 +480,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, cls_u32.hnode.handle = h->handle; cls_u32.hnode.prio = h->prio; - tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false); + tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false, true); } static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, @@ -498,7 +498,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, cls_u32.hnode.handle = h->handle; cls_u32.hnode.prio = h->prio; - err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw); + err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw, true); if (err < 0) { u32_clear_hw_hnode(tp, h, NULL); return err; @@ -522,8 +522,8 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, cls_u32.command = TC_CLSU32_DELETE_KNODE; cls_u32.knode.handle = n->handle; - tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false); - tcf_block_offload_dec(block, &n->flags); + tc_setup_cb_destroy(block, tp, TC_SETUP_CLSU32, &cls_u32, false, + &n->flags, &n->in_hw_count, true); } static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, @@ -552,13 +552,11 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, if (n->ht_down) cls_u32.knode.link_handle = ht->handle; - err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw); - if (err < 0) { + err = tc_setup_cb_add(block, tp, TC_SETUP_CLSU32, &cls_u32, skip_sw, + &n->flags, &n->in_hw_count, true); + if (err) { u32_remove_hw_knode(tp, n, NULL); return err; - } else if (err > 0) { - n->in_hw_count = err; - tcf_block_offload_inc(block, &n->flags); } if (skip_sw && !(n->flags & TCA_CLS_FLAGS_IN_HW)) @@ -1201,14 +1199,11 @@ static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n, cls_u32.knode.link_handle = ht->handle; } - err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv); - if (err) { - if (add && tc_skip_sw(n->flags)) - return err; - return 0; - } - - tc_cls_offload_cnt_update(block, &n->in_hw_count, &n->flags, add); + err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSU32, + &cls_u32, cb_priv, &n->flags, + &n->in_hw_count); + if (err) + return err; return 0; } @@ -1260,12 +1255,17 @@ static int u32_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, return 0; } -static void u32_bind_class(void *fh, u32 classid, unsigned long cl) +static void u32_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct tc_u_knode *n = fh; - if (n && n->res.classid == classid) - n->res.class = cl; + if (n && n->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &n->res, base); + else + __tcf_unbind_filter(q, &n->res); + } } static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 82bd14e7ac93..d99966a55c84 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -446,7 +446,7 @@ META_COLLECTOR(int_sk_wmem_queued) *err = -1; return; } - dst->value = sk->sk_wmem_queued; + dst->value = READ_ONCE(sk->sk_wmem_queued); } META_COLLECTOR(int_sk_fwd_alloc) @@ -521,7 +521,7 @@ META_COLLECTOR(int_sk_ack_bl) *err = -1; return; } - dst->value = sk->sk_ack_backlog; + dst->value = READ_ONCE(sk->sk_ack_backlog); } META_COLLECTOR(int_sk_max_ack_bl) @@ -532,7 +532,7 @@ META_COLLECTOR(int_sk_max_ack_bl) *err = -1; return; } - dst->value = sk->sk_max_ack_backlog; + dst->value = READ_ONCE(sk->sk_max_ack_backlog); } META_COLLECTOR(int_sk_prio) @@ -554,7 +554,7 @@ META_COLLECTOR(int_sk_rcvlowat) *err = -1; return; } - dst->value = sk->sk_rcvlowat; + dst->value = READ_ONCE(sk->sk_rcvlowat); } META_COLLECTOR(int_sk_rcvtimeo) diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 8f2ad706784d..dd3b8c11a2e0 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -238,6 +238,9 @@ static int tcf_em_validate(struct tcf_proto *tp, goto errout; if (em->ops->change) { + err = -EINVAL; + if (em_hdr->flags & TCF_EM_SIMPLE) + goto errout; err = em->ops->change(net, data, data_len, em); if (err < 0) goto errout; @@ -263,12 +266,12 @@ static int tcf_em_validate(struct tcf_proto *tp, } em->data = (unsigned long) v; } + em->datalen = data_len; } } em->matchid = em_hdr->matchid; em->flags = em_hdr->flags; - em->datalen = data_len; em->net = net; err = 0; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 04faee7ccbce..50794125bf02 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1891,8 +1891,9 @@ static int tclass_del_notify(struct net *net, struct tcf_bind_args { struct tcf_walker w; - u32 classid; + unsigned long base; unsigned long cl; + u32 classid; }; static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) @@ -1903,26 +1904,30 @@ static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) struct Qdisc *q = tcf_block_q(tp->chain->block); sch_tree_lock(q); - tp->ops->bind_class(n, a->classid, a->cl); + tp->ops->bind_class(n, a->classid, a->cl, q, a->base); sch_tree_unlock(q); } return 0; } -static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, - unsigned long new_cl) +struct tc_bind_class_args { + struct qdisc_walker w; + unsigned long new_cl; + u32 portid; + u32 clid; +}; + +static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl, + struct qdisc_walker *w) { + struct tc_bind_class_args *a = (struct tc_bind_class_args *)w; const struct Qdisc_class_ops *cops = q->ops->cl_ops; struct tcf_block *block; struct tcf_chain *chain; - unsigned long cl; - cl = cops->find(q, portid); - if (!cl) - return; block = cops->tcf_block(q, cl, NULL); if (!block) - return; + return 0; for (chain = tcf_get_next_chain(block, NULL); chain; chain = tcf_get_next_chain(block, chain)) { @@ -1933,11 +1938,29 @@ static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, struct tcf_bind_args arg = {}; arg.w.fn = tcf_node_bind; - arg.classid = clid; - arg.cl = new_cl; + arg.classid = a->clid; + arg.base = cl; + arg.cl = a->new_cl; tp->ops->walk(tp, &arg.w, true); } } + + return 0; +} + +static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, + unsigned long new_cl) +{ + const struct Qdisc_class_ops *cops = q->ops->cl_ops; + struct tc_bind_class_args args = {}; + + if (!cops->tcf_block) + return; + args.portid = portid; + args.clid = clid; + args.new_cl = new_cl; + args.w.fn = tc_bind_class_walker; + q->ops->cl_ops->walk(q, &args.w); } #else diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 53a80bc6b13a..1496e87cd07b 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -173,8 +173,7 @@ struct cake_tin_data { u64 tin_rate_bps; u16 tin_rate_shft; - u16 tin_quantum_prio; - u16 tin_quantum_band; + u16 tin_quantum; s32 tin_deficit; u32 tin_backlog; u32 tin_dropped; @@ -1683,8 +1682,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (IS_ERR_OR_NULL(segs)) return qdisc_drop(skb, sch, to_free); - while (segs) { - nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; cobalt_set_enqueue_time(segs, now); @@ -1697,7 +1695,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, slen += segs->len; q->buffer_used += segs->truesize; b->packets++; - segs = nskb; } /* stats */ @@ -1769,7 +1766,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->avg_window_begin)); u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; - do_div(b, window_interval); + b = div64_u64(b, window_interval); q->avg_peak_bandwidth = cake_ewma(q->avg_peak_bandwidth, b, b > q->avg_peak_bandwidth ? 2 : 8); @@ -1919,7 +1916,7 @@ begin: while (b->tin_deficit < 0 || !(b->sparse_flow_count + b->bulk_flow_count)) { if (b->tin_deficit <= 0) - b->tin_deficit += b->tin_quantum_band; + b->tin_deficit += b->tin_quantum; if (b->sparse_flow_count + b->bulk_flow_count) empty = false; @@ -2184,6 +2181,7 @@ static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { [TCA_CAKE_MPU] = { .type = NLA_U32 }, [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, + [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 }, [TCA_CAKE_FWMARK] = { .type = NLA_U32 }, }; @@ -2240,8 +2238,7 @@ static int cake_config_besteffort(struct Qdisc *sch) cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - b->tin_quantum_band = 65535; - b->tin_quantum_prio = 65535; + b->tin_quantum = 65535; return 0; } @@ -2252,8 +2249,7 @@ static int cake_config_precedence(struct Qdisc *sch) struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; - u32 quantum1 = 256; - u32 quantum2 = 256; + u32 quantum = 256; u32 i; q->tin_cnt = 8; @@ -2266,18 +2262,14 @@ static int cake_config_precedence(struct Qdisc *sch) cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - b->tin_quantum_prio = max_t(u16, 1U, quantum1); - b->tin_quantum_band = max_t(u16, 1U, quantum2); + b->tin_quantum = max_t(u16, 1U, quantum); /* calculate next class's parameters */ rate *= 7; rate >>= 3; - quantum1 *= 3; - quantum1 >>= 1; - - quantum2 *= 7; - quantum2 >>= 3; + quantum *= 7; + quantum >>= 3; } return 0; @@ -2346,8 +2338,7 @@ static int cake_config_diffserv8(struct Qdisc *sch) struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; - u32 quantum1 = 256; - u32 quantum2 = 256; + u32 quantum = 256; u32 i; q->tin_cnt = 8; @@ -2363,18 +2354,14 @@ static int cake_config_diffserv8(struct Qdisc *sch) cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - b->tin_quantum_prio = max_t(u16, 1U, quantum1); - b->tin_quantum_band = max_t(u16, 1U, quantum2); + b->tin_quantum = max_t(u16, 1U, quantum); /* calculate next class's parameters */ rate *= 7; rate >>= 3; - quantum1 *= 3; - quantum1 >>= 1; - - quantum2 *= 7; - quantum2 >>= 3; + quantum *= 7; + quantum >>= 3; } return 0; @@ -2413,17 +2400,11 @@ static int cake_config_diffserv4(struct Qdisc *sch) cake_set_rate(&q->tins[3], rate >> 2, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - /* priority weights */ - q->tins[0].tin_quantum_prio = quantum; - q->tins[1].tin_quantum_prio = quantum >> 4; - q->tins[2].tin_quantum_prio = quantum << 2; - q->tins[3].tin_quantum_prio = quantum << 4; - /* bandwidth-sharing weights */ - q->tins[0].tin_quantum_band = quantum; - q->tins[1].tin_quantum_band = quantum >> 4; - q->tins[2].tin_quantum_band = quantum >> 1; - q->tins[3].tin_quantum_band = quantum >> 2; + q->tins[0].tin_quantum = quantum; + q->tins[1].tin_quantum = quantum >> 4; + q->tins[2].tin_quantum = quantum >> 1; + q->tins[3].tin_quantum = quantum >> 2; return 0; } @@ -2454,15 +2435,10 @@ static int cake_config_diffserv3(struct Qdisc *sch) cake_set_rate(&q->tins[2], rate >> 2, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - /* priority weights */ - q->tins[0].tin_quantum_prio = quantum; - q->tins[1].tin_quantum_prio = quantum >> 4; - q->tins[2].tin_quantum_prio = quantum << 4; - /* bandwidth-sharing weights */ - q->tins[0].tin_quantum_band = quantum; - q->tins[1].tin_quantum_band = quantum >> 4; - q->tins[2].tin_quantum_band = quantum >> 2; + q->tins[0].tin_quantum = quantum; + q->tins[1].tin_quantum = quantum >> 4; + q->tins[2].tin_quantum = quantum >> 2; return 0; } diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 06c7a2da21bc..39b427dc7512 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1127,6 +1127,33 @@ static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = { [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) }, }; +static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1], + struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + int err; + + if (!opt) { + NL_SET_ERR_MSG(extack, "CBQ options are required for this operation"); + return -EINVAL; + } + + err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, + cbq_policy, extack); + if (err < 0) + return err; + + if (tb[TCA_CBQ_WRROPT]) { + const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]); + + if (wrr->priority > TC_CBQ_MAXPRIO) { + NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO"); + err = -EINVAL; + } + } + return err; +} + static int cbq_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -1139,13 +1166,7 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt, hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); q->delay_timer.function = cbq_undelay; - if (!opt) { - NL_SET_ERR_MSG(extack, "CBQ options are required for this operation"); - return -EINVAL; - } - - err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, cbq_policy, - extack); + err = cbq_opt_parse(tb, opt, extack); if (err < 0) return err; @@ -1464,13 +1485,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t struct cbq_class *parent; struct qdisc_rate_table *rtab = NULL; - if (!opt) { - NL_SET_ERR_MSG(extack, "Mandatory qdisc options missing"); - return -EINVAL; - } - - err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, cbq_policy, - extack); + err = cbq_opt_parse(tb, opt, extack); if (err < 0) return err; diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 732e109c3055..b2905b03a432 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -181,11 +181,6 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) s64 credits; int len; - if (atomic64_read(&q->port_rate) == -1) { - WARN_ONCE(1, "cbs: dequeue() called with unknown port rate."); - return NULL; - } - if (q->credits < 0) { credits = timediff_to_credits(now - q->last, q->idleslope); @@ -303,11 +298,19 @@ static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q, static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q) { struct ethtool_link_ksettings ecmd; - int port_rate = -1; + int speed = SPEED_10; + int port_rate; + int err; + + err = __ethtool_get_link_ksettings(dev, &ecmd); + if (err < 0) + goto skip; + + if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN) + speed = ecmd.base.speed; - if (!__ethtool_get_link_ksettings(dev, &ecmd) && - ecmd.base.speed != SPEED_UNKNOWN) - port_rate = ecmd.base.speed * 1000 * BYTES_PER_KBIT; +skip: + port_rate = speed * 1000 * BYTES_PER_KBIT; atomic64_set(&q->port_rate, port_rate); netdev_dbg(dev, "cbs: set %s's port_rate to: %lld, linkspeed: %d\n", @@ -389,7 +392,6 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, { struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - int err; if (!opt) { NL_SET_ERR_MSG(extack, "Missing CBS qdisc options which are mandatory"); @@ -401,6 +403,10 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, if (!q->qdisc) return -ENOMEM; + spin_lock(&cbs_list_lock); + list_add(&q->cbs_list, &cbs_list); + spin_unlock(&cbs_list_lock); + qdisc_hash_add(q->qdisc, false); q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); @@ -410,17 +416,7 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, qdisc_watchdog_init(&q->watchdog, sch); - err = cbs_change(sch, opt, extack); - if (err) - return err; - - if (!q->offload) { - spin_lock(&cbs_list_lock); - list_add(&q->cbs_list, &cbs_list); - spin_unlock(&cbs_list_lock); - } - - return 0; + return cbs_change(sch, opt, extack); } static void cbs_destroy(struct Qdisc *sch) @@ -428,15 +424,18 @@ static void cbs_destroy(struct Qdisc *sch) struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - spin_lock(&cbs_list_lock); - list_del(&q->cbs_list); - spin_unlock(&cbs_list_lock); + /* Nothing to do if we couldn't create the underlying qdisc */ + if (!q->qdisc) + return; qdisc_watchdog_cancel(&q->watchdog); cbs_disable_offload(dev, q); - if (q->qdisc) - qdisc_put(q->qdisc); + spin_lock(&cbs_list_lock); + list_del(&q->cbs_list); + spin_unlock(&cbs_list_lock); + + qdisc_put(q->qdisc); } static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index dba70377bbd9..a36974e9c601 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -377,7 +377,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt, if (mask != q->tab_mask) { struct sk_buff **ntab; - ntab = kvmalloc_array((mask + 1), sizeof(struct sk_buff *), GFP_KERNEL | __GFP_ZERO); + ntab = kvcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL); if (!ntab) return -ENOMEM; diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index bad1cbe59a56..05605b30bef3 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -361,6 +361,8 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt, goto errout; err = -EINVAL; + if (!tb[TCA_DSMARK_INDICES]) + goto errout; indices = nla_get_u16(tb[TCA_DSMARK_INDICES]); if (hweight32(indices) != 1) diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index cebfb65d8556..b1da5589a0c6 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -177,7 +177,7 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, parent = *p; skb = rb_to_skb(parent); - if (ktime_after(txtime, skb->tstamp)) { + if (ktime_compare(txtime, skb->tstamp) >= 0) { p = &parent->rb_right; leftmost = false; } else { diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c new file mode 100644 index 000000000000..a87e9159338c --- /dev/null +++ b/net/sched/sch_ets.c @@ -0,0 +1,828 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * net/sched/sch_ets.c Enhanced Transmission Selection scheduler + * + * Description + * ----------- + * + * The Enhanced Transmission Selection scheduler is a classful queuing + * discipline that merges functionality of PRIO and DRR qdiscs in one scheduler. + * ETS makes it easy to configure a set of strict and bandwidth-sharing bands to + * implement the transmission selection described in 802.1Qaz. + * + * Although ETS is technically classful, it's not possible to add and remove + * classes at will. Instead one specifies number of classes, how many are + * PRIO-like and how many DRR-like, and quanta for the latter. + * + * Algorithm + * --------- + * + * The strict classes, if any, are tried for traffic first: first band 0, if it + * has no traffic then band 1, etc. + * + * When there is no traffic in any of the strict queues, the bandwidth-sharing + * ones are tried next. Each band is assigned a deficit counter, initialized to + * "quantum" of that band. ETS maintains a list of active bandwidth-sharing + * bands whose qdiscs are non-empty. A packet is dequeued from the band at the + * head of the list if the packet size is smaller or equal to the deficit + * counter. If the counter is too small, it is increased by "quantum" and the + * scheduler moves on to the next band in the active list. + */ + +#include <linux/module.h> +#include <net/gen_stats.h> +#include <net/netlink.h> +#include <net/pkt_cls.h> +#include <net/pkt_sched.h> +#include <net/sch_generic.h> + +struct ets_class { + struct list_head alist; /* In struct ets_sched.active. */ + struct Qdisc *qdisc; + u32 quantum; + u32 deficit; + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; +}; + +struct ets_sched { + struct list_head active; + struct tcf_proto __rcu *filter_list; + struct tcf_block *block; + unsigned int nbands; + unsigned int nstrict; + u8 prio2band[TC_PRIO_MAX + 1]; + struct ets_class classes[TCQ_ETS_MAX_BANDS]; +}; + +static const struct nla_policy ets_policy[TCA_ETS_MAX + 1] = { + [TCA_ETS_NBANDS] = { .type = NLA_U8 }, + [TCA_ETS_NSTRICT] = { .type = NLA_U8 }, + [TCA_ETS_QUANTA] = { .type = NLA_NESTED }, + [TCA_ETS_PRIOMAP] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ets_priomap_policy[TCA_ETS_MAX + 1] = { + [TCA_ETS_PRIOMAP_BAND] = { .type = NLA_U8 }, +}; + +static const struct nla_policy ets_quanta_policy[TCA_ETS_MAX + 1] = { + [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 }, +}; + +static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = { + [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 }, +}; + +static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr, + unsigned int *quantum, + struct netlink_ext_ack *extack) +{ + *quantum = nla_get_u32(attr); + if (!*quantum) { + NL_SET_ERR_MSG(extack, "ETS quantum cannot be zero"); + return -EINVAL; + } + return 0; +} + +static struct ets_class * +ets_class_from_arg(struct Qdisc *sch, unsigned long arg) +{ + struct ets_sched *q = qdisc_priv(sch); + + return &q->classes[arg - 1]; +} + +static u32 ets_class_id(struct Qdisc *sch, const struct ets_class *cl) +{ + struct ets_sched *q = qdisc_priv(sch); + int band = cl - q->classes; + + return TC_H_MAKE(sch->handle, band + 1); +} + +static void ets_offload_change(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct ets_sched *q = qdisc_priv(sch); + struct tc_ets_qopt_offload qopt; + unsigned int w_psum_prev = 0; + unsigned int q_psum = 0; + unsigned int q_sum = 0; + unsigned int quantum; + unsigned int w_psum; + unsigned int weight; + unsigned int i; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_ETS_REPLACE; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.replace_params.bands = q->nbands; + qopt.replace_params.qstats = &sch->qstats; + memcpy(&qopt.replace_params.priomap, + q->prio2band, sizeof(q->prio2band)); + + for (i = 0; i < q->nbands; i++) + q_sum += q->classes[i].quantum; + + for (i = 0; i < q->nbands; i++) { + quantum = q->classes[i].quantum; + q_psum += quantum; + w_psum = quantum ? q_psum * 100 / q_sum : 0; + weight = w_psum - w_psum_prev; + w_psum_prev = w_psum; + + qopt.replace_params.quanta[i] = quantum; + qopt.replace_params.weights[i] = weight; + } + + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt); +} + +static void ets_offload_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_ets_qopt_offload qopt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_ETS_DESTROY; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt); +} + +static void ets_offload_graft(struct Qdisc *sch, struct Qdisc *new, + struct Qdisc *old, unsigned long arg, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_ets_qopt_offload qopt; + + qopt.command = TC_ETS_GRAFT; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.graft_params.band = arg - 1; + qopt.graft_params.child_handle = new->handle; + + qdisc_offload_graft_helper(dev, sch, new, old, TC_SETUP_QDISC_ETS, + &qopt, extack); +} + +static int ets_offload_dump(struct Qdisc *sch) +{ + struct tc_ets_qopt_offload qopt; + + qopt.command = TC_ETS_STATS; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.stats.bstats = &sch->bstats; + qopt.stats.qstats = &sch->qstats; + + return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_ETS, &qopt); +} + +static bool ets_class_is_strict(struct ets_sched *q, const struct ets_class *cl) +{ + unsigned int band = cl - q->classes; + + return band < q->nstrict; +} + +static int ets_class_change(struct Qdisc *sch, u32 classid, u32 parentid, + struct nlattr **tca, unsigned long *arg, + struct netlink_ext_ack *extack) +{ + struct ets_class *cl = ets_class_from_arg(sch, *arg); + struct ets_sched *q = qdisc_priv(sch); + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_ETS_MAX + 1]; + unsigned int quantum; + int err; + + /* Classes can be added and removed only through Qdisc_ops.change + * interface. + */ + if (!cl) { + NL_SET_ERR_MSG(extack, "Fine-grained class addition and removal is not supported"); + return -EOPNOTSUPP; + } + + if (!opt) { + NL_SET_ERR_MSG(extack, "ETS options are required for this operation"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_class_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_ETS_QUANTA_BAND]) + /* Nothing to configure. */ + return 0; + + if (ets_class_is_strict(q, cl)) { + NL_SET_ERR_MSG(extack, "Strict bands do not have a configurable quantum"); + return -EINVAL; + } + + err = ets_quantum_parse(sch, tb[TCA_ETS_QUANTA_BAND], &quantum, + extack); + if (err) + return err; + + sch_tree_lock(sch); + cl->quantum = quantum; + sch_tree_unlock(sch); + + ets_offload_change(sch); + return 0; +} + +static int ets_class_graft(struct Qdisc *sch, unsigned long arg, + struct Qdisc *new, struct Qdisc **old, + struct netlink_ext_ack *extack) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + + if (!new) { + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + ets_class_id(sch, cl), NULL); + if (!new) + new = &noop_qdisc; + else + qdisc_hash_add(new, true); + } + + *old = qdisc_replace(sch, new, &cl->qdisc); + ets_offload_graft(sch, new, *old, arg, extack); + return 0; +} + +static struct Qdisc *ets_class_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + + return cl->qdisc; +} + +static unsigned long ets_class_find(struct Qdisc *sch, u32 classid) +{ + unsigned long band = TC_H_MIN(classid); + struct ets_sched *q = qdisc_priv(sch); + + if (band - 1 >= q->nbands) + return 0; + return band; +} + +static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + struct ets_sched *q = qdisc_priv(sch); + + /* We get notified about zero-length child Qdiscs as well if they are + * offloaded. Those aren't on the active list though, so don't attempt + * to remove them. + */ + if (!ets_class_is_strict(q, cl) && sch->q.qlen) + list_del(&cl->alist); +} + +static int ets_class_dump(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + struct ets_sched *q = qdisc_priv(sch); + struct nlattr *nest; + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = ets_class_id(sch, cl); + tcm->tcm_info = cl->qdisc->handle; + + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + if (!ets_class_is_strict(q, cl)) { + if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, cl->quantum)) + goto nla_put_failure; + } + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int ets_class_dump_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + struct Qdisc *cl_q = cl->qdisc; + + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl_q->bstats) < 0 || + qdisc_qstats_copy(d, cl_q) < 0) + return -1; + + return 0; +} + +static void ets_qdisc_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct ets_sched *q = qdisc_priv(sch); + int i; + + if (arg->stop) + return; + + for (i = 0; i < q->nbands; i++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_block * +ets_qdisc_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) +{ + struct ets_sched *q = qdisc_priv(sch); + + if (cl) { + NL_SET_ERR_MSG(extack, "ETS classid must be zero"); + return NULL; + } + + return q->block; +} + +static unsigned long ets_qdisc_bind_tcf(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return ets_class_find(sch, classid); +} + +static void ets_qdisc_unbind_tcf(struct Qdisc *sch, unsigned long arg) +{ +} + +static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct ets_sched *q = qdisc_priv(sch); + u32 band = skb->priority; + struct tcf_result res; + struct tcf_proto *fl; + int err; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + if (TC_H_MAJ(skb->priority) != sch->handle) { + fl = rcu_dereference_bh(q->filter_list); + err = tcf_classify(skb, fl, &res, false); +#ifdef CONFIG_NET_CLS_ACT + switch (err) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ + case TC_ACT_SHOT: + return NULL; + } +#endif + if (!fl || err < 0) { + if (TC_H_MAJ(band)) + band = 0; + return &q->classes[q->prio2band[band & TC_PRIO_MAX]]; + } + band = res.classid; + } + band = TC_H_MIN(band) - 1; + if (band >= q->nbands) + return &q->classes[q->prio2band[0]]; + return &q->classes[band]; +} + +static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + unsigned int len = qdisc_pkt_len(skb); + struct ets_sched *q = qdisc_priv(sch); + struct ets_class *cl; + int err = 0; + bool first; + + cl = ets_classify(skb, sch, &err); + if (!cl) { + if (err & __NET_XMIT_BYPASS) + qdisc_qstats_drop(sch); + __qdisc_drop(skb, to_free); + return err; + } + + first = !cl->qdisc->q.qlen; + err = qdisc_enqueue(skb, cl->qdisc, to_free); + if (unlikely(err != NET_XMIT_SUCCESS)) { + if (net_xmit_drop_count(err)) { + cl->qstats.drops++; + qdisc_qstats_drop(sch); + } + return err; + } + + if (first && !ets_class_is_strict(q, cl)) { + list_add_tail(&cl->alist, &q->active); + cl->deficit = cl->quantum; + } + + sch->qstats.backlog += len; + sch->q.qlen++; + return err; +} + +static struct sk_buff * +ets_qdisc_dequeue_skb(struct Qdisc *sch, struct sk_buff *skb) +{ + qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; + return skb; +} + +static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch) +{ + struct ets_sched *q = qdisc_priv(sch); + struct ets_class *cl; + struct sk_buff *skb; + unsigned int band; + unsigned int len; + + while (1) { + for (band = 0; band < q->nstrict; band++) { + cl = &q->classes[band]; + skb = qdisc_dequeue_peeked(cl->qdisc); + if (skb) + return ets_qdisc_dequeue_skb(sch, skb); + } + + if (list_empty(&q->active)) + goto out; + + cl = list_first_entry(&q->active, struct ets_class, alist); + skb = cl->qdisc->ops->peek(cl->qdisc); + if (!skb) { + qdisc_warn_nonwc(__func__, cl->qdisc); + goto out; + } + + len = qdisc_pkt_len(skb); + if (len <= cl->deficit) { + cl->deficit -= len; + skb = qdisc_dequeue_peeked(cl->qdisc); + if (unlikely(!skb)) + goto out; + if (cl->qdisc->q.qlen == 0) + list_del(&cl->alist); + return ets_qdisc_dequeue_skb(sch, skb); + } + + cl->deficit += cl->quantum; + list_move_tail(&cl->alist, &q->active); + } +out: + return NULL; +} + +static int ets_qdisc_priomap_parse(struct nlattr *priomap_attr, + unsigned int nbands, u8 *priomap, + struct netlink_ext_ack *extack) +{ + const struct nlattr *attr; + int prio = 0; + u8 band; + int rem; + int err; + + err = __nla_validate_nested(priomap_attr, TCA_ETS_MAX, + ets_priomap_policy, NL_VALIDATE_STRICT, + extack); + if (err) + return err; + + nla_for_each_nested(attr, priomap_attr, rem) { + switch (nla_type(attr)) { + case TCA_ETS_PRIOMAP_BAND: + if (prio > TC_PRIO_MAX) { + NL_SET_ERR_MSG_MOD(extack, "Too many priorities in ETS priomap"); + return -EINVAL; + } + band = nla_get_u8(attr); + if (band >= nbands) { + NL_SET_ERR_MSG_MOD(extack, "Invalid band number in ETS priomap"); + return -EINVAL; + } + priomap[prio++] = band; + break; + default: + WARN_ON_ONCE(1); /* Validate should have caught this. */ + return -EINVAL; + } + } + + return 0; +} + +static int ets_qdisc_quanta_parse(struct Qdisc *sch, struct nlattr *quanta_attr, + unsigned int nbands, unsigned int nstrict, + unsigned int *quanta, + struct netlink_ext_ack *extack) +{ + const struct nlattr *attr; + int band = nstrict; + int rem; + int err; + + err = __nla_validate_nested(quanta_attr, TCA_ETS_MAX, + ets_quanta_policy, NL_VALIDATE_STRICT, + extack); + if (err < 0) + return err; + + nla_for_each_nested(attr, quanta_attr, rem) { + switch (nla_type(attr)) { + case TCA_ETS_QUANTA_BAND: + if (band >= nbands) { + NL_SET_ERR_MSG_MOD(extack, "ETS quanta has more values than bands"); + return -EINVAL; + } + err = ets_quantum_parse(sch, attr, &quanta[band++], + extack); + if (err) + return err; + break; + default: + WARN_ON_ONCE(1); /* Validate should have caught this. */ + return -EINVAL; + } + } + + return 0; +} + +static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + unsigned int quanta[TCQ_ETS_MAX_BANDS] = {0}; + struct Qdisc *queues[TCQ_ETS_MAX_BANDS]; + struct ets_sched *q = qdisc_priv(sch); + struct nlattr *tb[TCA_ETS_MAX + 1]; + unsigned int oldbands = q->nbands; + u8 priomap[TC_PRIO_MAX + 1]; + unsigned int nstrict = 0; + unsigned int nbands; + unsigned int i; + int err; + + if (!opt) { + NL_SET_ERR_MSG(extack, "ETS options are required for this operation"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_ETS_NBANDS]) { + NL_SET_ERR_MSG_MOD(extack, "Number of bands is a required argument"); + return -EINVAL; + } + nbands = nla_get_u8(tb[TCA_ETS_NBANDS]); + if (nbands < 1 || nbands > TCQ_ETS_MAX_BANDS) { + NL_SET_ERR_MSG_MOD(extack, "Invalid number of bands"); + return -EINVAL; + } + /* Unless overridden, traffic goes to the last band. */ + memset(priomap, nbands - 1, sizeof(priomap)); + + if (tb[TCA_ETS_NSTRICT]) { + nstrict = nla_get_u8(tb[TCA_ETS_NSTRICT]); + if (nstrict > nbands) { + NL_SET_ERR_MSG_MOD(extack, "Invalid number of strict bands"); + return -EINVAL; + } + } + + if (tb[TCA_ETS_PRIOMAP]) { + err = ets_qdisc_priomap_parse(tb[TCA_ETS_PRIOMAP], + nbands, priomap, extack); + if (err) + return err; + } + + if (tb[TCA_ETS_QUANTA]) { + err = ets_qdisc_quanta_parse(sch, tb[TCA_ETS_QUANTA], + nbands, nstrict, quanta, extack); + if (err) + return err; + } + /* If there are more bands than strict + quanta provided, the remaining + * ones are ETS with quantum of MTU. Initialize the missing values here. + */ + for (i = nstrict; i < nbands; i++) { + if (!quanta[i]) + quanta[i] = psched_mtu(qdisc_dev(sch)); + } + + /* Before commit, make sure we can allocate all new qdiscs */ + for (i = oldbands; i < nbands; i++) { + queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + ets_class_id(sch, &q->classes[i]), + extack); + if (!queues[i]) { + while (i > oldbands) + qdisc_put(queues[--i]); + return -ENOMEM; + } + } + + sch_tree_lock(sch); + + q->nbands = nbands; + q->nstrict = nstrict; + memcpy(q->prio2band, priomap, sizeof(priomap)); + + for (i = q->nbands; i < oldbands; i++) + qdisc_tree_flush_backlog(q->classes[i].qdisc); + + for (i = 0; i < q->nbands; i++) + q->classes[i].quantum = quanta[i]; + + for (i = oldbands; i < q->nbands; i++) { + q->classes[i].qdisc = queues[i]; + if (q->classes[i].qdisc != &noop_qdisc) + qdisc_hash_add(q->classes[i].qdisc, true); + } + + sch_tree_unlock(sch); + + ets_offload_change(sch); + for (i = q->nbands; i < oldbands; i++) { + qdisc_put(q->classes[i].qdisc); + memset(&q->classes[i], 0, sizeof(q->classes[i])); + } + return 0; +} + +static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct ets_sched *q = qdisc_priv(sch); + int err; + + if (!opt) + return -EINVAL; + + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); + if (err) + return err; + + INIT_LIST_HEAD(&q->active); + return ets_qdisc_change(sch, opt, extack); +} + +static void ets_qdisc_reset(struct Qdisc *sch) +{ + struct ets_sched *q = qdisc_priv(sch); + int band; + + for (band = q->nstrict; band < q->nbands; band++) { + if (q->classes[band].qdisc->q.qlen) + list_del(&q->classes[band].alist); + } + for (band = 0; band < q->nbands; band++) + qdisc_reset(q->classes[band].qdisc); + sch->qstats.backlog = 0; + sch->q.qlen = 0; +} + +static void ets_qdisc_destroy(struct Qdisc *sch) +{ + struct ets_sched *q = qdisc_priv(sch); + int band; + + ets_offload_destroy(sch); + tcf_block_put(q->block); + for (band = 0; band < q->nbands; band++) + qdisc_put(q->classes[band].qdisc); +} + +static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct ets_sched *q = qdisc_priv(sch); + struct nlattr *opts; + struct nlattr *nest; + int band; + int prio; + int err; + + err = ets_offload_dump(sch); + if (err) + return err; + + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); + if (!opts) + goto nla_err; + + if (nla_put_u8(skb, TCA_ETS_NBANDS, q->nbands)) + goto nla_err; + + if (q->nstrict && + nla_put_u8(skb, TCA_ETS_NSTRICT, q->nstrict)) + goto nla_err; + + if (q->nbands > q->nstrict) { + nest = nla_nest_start(skb, TCA_ETS_QUANTA); + if (!nest) + goto nla_err; + + for (band = q->nstrict; band < q->nbands; band++) { + if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, + q->classes[band].quantum)) + goto nla_err; + } + + nla_nest_end(skb, nest); + } + + nest = nla_nest_start(skb, TCA_ETS_PRIOMAP); + if (!nest) + goto nla_err; + + for (prio = 0; prio <= TC_PRIO_MAX; prio++) { + if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, q->prio2band[prio])) + goto nla_err; + } + + nla_nest_end(skb, nest); + + return nla_nest_end(skb, opts); + +nla_err: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static const struct Qdisc_class_ops ets_class_ops = { + .change = ets_class_change, + .graft = ets_class_graft, + .leaf = ets_class_leaf, + .find = ets_class_find, + .qlen_notify = ets_class_qlen_notify, + .dump = ets_class_dump, + .dump_stats = ets_class_dump_stats, + .walk = ets_qdisc_walk, + .tcf_block = ets_qdisc_tcf_block, + .bind_tcf = ets_qdisc_bind_tcf, + .unbind_tcf = ets_qdisc_unbind_tcf, +}; + +static struct Qdisc_ops ets_qdisc_ops __read_mostly = { + .cl_ops = &ets_class_ops, + .id = "ets", + .priv_size = sizeof(struct ets_sched), + .enqueue = ets_qdisc_enqueue, + .dequeue = ets_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .change = ets_qdisc_change, + .init = ets_qdisc_init, + .reset = ets_qdisc_reset, + .destroy = ets_qdisc_destroy, + .dump = ets_qdisc_dump, + .owner = THIS_MODULE, +}; + +static int __init ets_init(void) +{ + return register_qdisc(&ets_qdisc_ops); +} + +static void __exit ets_exit(void) +{ + unregister_qdisc(&ets_qdisc_ops); +} + +module_init(ets_init); +module_exit(ets_exit); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 98dd87ce1510..a5a295477ecc 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -301,6 +301,9 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) f->socket_hash != sk->sk_hash)) { f->credit = q->initial_quantum; f->socket_hash = sk->sk_hash; + if (q->rate_enable) + smp_store_release(&sk->sk_pacing_status, + SK_PACING_FQ); if (fq_flow_is_throttled(f)) fq_flow_unset_throttled(q, f); f->time_next_packet = 0ULL; @@ -322,8 +325,12 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) fq_flow_set_detached(f); f->sk = sk; - if (skb->sk == sk) + if (skb->sk == sk) { f->socket_hash = sk->sk_hash; + if (q->rate_enable) + smp_store_release(&sk->sk_pacing_status, + SK_PACING_FQ); + } f->credit = q->initial_quantum; rb_link_node(&f->fq_node, parent, p); @@ -428,17 +435,9 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, f->qlen++; qdisc_qstats_backlog_inc(sch, skb); if (fq_flow_is_detached(f)) { - struct sock *sk = skb->sk; - fq_flow_add_tail(&q->new_flows, f); if (time_after(jiffies, f->age + q->flow_refill_delay)) f->credit = max_t(u32, f->credit, q->quantum); - if (sk && q->rate_enable) { - if (unlikely(smp_load_acquire(&sk->sk_pacing_status) != - SK_PACING_FQ)) - smp_store_release(&sk->sk_pacing_status, - SK_PACING_FQ); - } q->inactive_flows--; } @@ -530,8 +529,7 @@ begin: fq_flow_set_throttled(q, f); goto begin; } - if (time_next_packet && - (s64)(now - time_next_packet - q->ce_threshold) > 0) { + if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { INET_ECN_set_ce(skb); q->stat_ce_mark++; } @@ -788,10 +786,12 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_QUANTUM]) { u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); - if (quantum > 0) + if (quantum > 0 && quantum <= (1 << 20)) { q->quantum = quantum; - else + } else { + NL_SET_ERR_MSG_MOD(extack, "invalid quantum"); err = -EINVAL; + } } if (tb[TCA_FQ_INITIAL_QUANTUM]) diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index d59fbcc745d1..968519ff36e9 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -14,7 +14,6 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/skbuff.h> -#include <linux/jhash.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/netlink.h> @@ -45,7 +44,6 @@ struct fq_codel_flow { struct sk_buff *tail; struct list_head flowchain; int deficit; - u32 dropped; /* number of drops (or ECN marks) on this flow */ struct codel_vars cvars; }; /* please try to keep this structure <= 64 bytes */ @@ -173,7 +171,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets, __qdisc_drop(skb, to_free); } while (++i < max_packets && len < threshold); - flow->dropped += i; + /* Tell codel to increase its signal strength also */ + flow->cvars.count += i; q->backlogs[idx] -= len; q->memory_usage -= mem; sch->qstats.drops += i; @@ -211,7 +210,6 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch, list_add_tail(&flow->flowchain, &q->new_flows); q->new_flow_count++; flow->deficit = q->quantum; - flow->dropped = 0; } get_codel_cb(skb)->mem_usage = skb->truesize; q->memory_usage += get_codel_cb(skb)->mem_usage; @@ -286,7 +284,6 @@ static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) struct sk_buff *skb; struct fq_codel_flow *flow; struct list_head *head; - u32 prev_drop_count, prev_ecn_mark; begin: head = &q->new_flows; @@ -303,16 +300,10 @@ begin: goto begin; } - prev_drop_count = q->cstats.drop_count; - prev_ecn_mark = q->cstats.ecn_mark; - skb = codel_dequeue(sch, &sch->qstats.backlog, &q->cparams, &flow->cvars, &q->cstats, qdisc_pkt_len, codel_get_enqueue_time, drop_func, dequeue_func); - flow->dropped += q->cstats.drop_count - prev_drop_count; - flow->dropped += q->cstats.ecn_mark - prev_ecn_mark; - if (!skb) { /* force a pass through old_flows to prevent starvation */ if ((head == &q->new_flows) && !list_empty(&q->old_flows)) @@ -658,7 +649,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, sch_tree_unlock(sch); } qs.backlog = q->backlogs[idx]; - qs.drops = flow->dropped; + qs.drops = 0; } if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c new file mode 100644 index 000000000000..214657eb3dfd --- /dev/null +++ b/net/sched/sch_fq_pie.c @@ -0,0 +1,562 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Flow Queue PIE discipline + * + * Copyright (C) 2019 Mohit P. Tahiliani <tahiliani@nitk.edu.in> + * Copyright (C) 2019 Sachin D. Patil <sdp.sachin@gmail.com> + * Copyright (C) 2019 V. Saicharan <vsaicharan1998@gmail.com> + * Copyright (C) 2019 Mohit Bhasi <mohitbhasi1998@gmail.com> + * Copyright (C) 2019 Leslie Monis <lesliemonis@gmail.com> + * Copyright (C) 2019 Gautam Ramakrishnan <gautamramk@gmail.com> + */ + +#include <linux/jhash.h> +#include <linux/sizes.h> +#include <linux/vmalloc.h> +#include <net/pkt_cls.h> +#include <net/pie.h> + +/* Flow Queue PIE + * + * Principles: + * - Packets are classified on flows. + * - This is a Stochastic model (as we use a hash, several flows might + * be hashed to the same slot) + * - Each flow has a PIE managed queue. + * - Flows are linked onto two (Round Robin) lists, + * so that new flows have priority on old ones. + * - For a given flow, packets are not reordered. + * - Drops during enqueue only. + * - ECN capability is off by default. + * - ECN threshold (if ECN is enabled) is at 10% by default. + * - Uses timestamps to calculate queue delay by default. + */ + +/** + * struct fq_pie_flow - contains data for each flow + * @vars: pie vars associated with the flow + * @deficit: number of remaining byte credits + * @backlog: size of data in the flow + * @qlen: number of packets in the flow + * @flowchain: flowchain for the flow + * @head: first packet in the flow + * @tail: last packet in the flow + */ +struct fq_pie_flow { + struct pie_vars vars; + s32 deficit; + u32 backlog; + u32 qlen; + struct list_head flowchain; + struct sk_buff *head; + struct sk_buff *tail; +}; + +struct fq_pie_sched_data { + struct tcf_proto __rcu *filter_list; /* optional external classifier */ + struct tcf_block *block; + struct fq_pie_flow *flows; + struct Qdisc *sch; + struct list_head old_flows; + struct list_head new_flows; + struct pie_params p_params; + u32 ecn_prob; + u32 flows_cnt; + u32 quantum; + u32 memory_limit; + u32 new_flow_count; + u32 memory_usage; + u32 overmemory; + struct pie_stats stats; + struct timer_list adapt_timer; +}; + +static unsigned int fq_pie_hash(const struct fq_pie_sched_data *q, + struct sk_buff *skb) +{ + return reciprocal_scale(skb_get_hash(skb), q->flows_cnt); +} + +static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct tcf_proto *filter; + struct tcf_result res; + int result; + + if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->flows_cnt) + return TC_H_MIN(skb->priority); + + filter = rcu_dereference_bh(q->filter_list); + if (!filter) + return fq_pie_hash(q, skb) + 1; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tcf_classify(skb, filter, &res, false); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ + case TC_ACT_SHOT: + return 0; + } +#endif + if (TC_H_MIN(res.classid) <= q->flows_cnt) + return TC_H_MIN(res.classid); + } + return 0; +} + +/* add skb to flow queue (tail add) */ +static inline void flow_queue_add(struct fq_pie_flow *flow, + struct sk_buff *skb) +{ + if (!flow->head) + flow->head = skb; + else + flow->tail->next = skb; + flow->tail = skb; + skb->next = NULL; +} + +static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct fq_pie_flow *sel_flow; + int uninitialized_var(ret); + u8 memory_limited = false; + u8 enqueue = false; + u32 pkt_len; + u32 idx; + + /* Classifies packet into corresponding flow */ + idx = fq_pie_classify(skb, sch, &ret); + sel_flow = &q->flows[idx]; + + /* Checks whether adding a new packet would exceed memory limit */ + get_pie_cb(skb)->mem_usage = skb->truesize; + memory_limited = q->memory_usage > q->memory_limit + skb->truesize; + + /* Checks if the qdisc is full */ + if (unlikely(qdisc_qlen(sch) >= sch->limit)) { + q->stats.overlimit++; + goto out; + } else if (unlikely(memory_limited)) { + q->overmemory++; + } + + if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars, + sel_flow->backlog, skb->len)) { + enqueue = true; + } else if (q->p_params.ecn && + sel_flow->vars.prob <= (MAX_PROB / 100) * q->ecn_prob && + INET_ECN_set_ce(skb)) { + /* If packet is ecn capable, mark it if drop probability + * is lower than the parameter ecn_prob, else drop it. + */ + q->stats.ecn_mark++; + enqueue = true; + } + if (enqueue) { + /* Set enqueue time only when dq_rate_estimator is disabled. */ + if (!q->p_params.dq_rate_estimator) + pie_set_enqueue_time(skb); + + pkt_len = qdisc_pkt_len(skb); + q->stats.packets_in++; + q->memory_usage += skb->truesize; + sch->qstats.backlog += pkt_len; + sch->q.qlen++; + flow_queue_add(sel_flow, skb); + if (list_empty(&sel_flow->flowchain)) { + list_add_tail(&sel_flow->flowchain, &q->new_flows); + q->new_flow_count++; + sel_flow->deficit = q->quantum; + sel_flow->qlen = 0; + sel_flow->backlog = 0; + } + sel_flow->qlen++; + sel_flow->backlog += pkt_len; + return NET_XMIT_SUCCESS; + } +out: + q->stats.dropped++; + sel_flow->vars.accu_prob = 0; + sel_flow->vars.accu_prob_overflows = 0; + __qdisc_drop(skb, to_free); + qdisc_qstats_drop(sch); + return NET_XMIT_CN; +} + +static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = { + [TCA_FQ_PIE_LIMIT] = {.type = NLA_U32}, + [TCA_FQ_PIE_FLOWS] = {.type = NLA_U32}, + [TCA_FQ_PIE_TARGET] = {.type = NLA_U32}, + [TCA_FQ_PIE_TUPDATE] = {.type = NLA_U32}, + [TCA_FQ_PIE_ALPHA] = {.type = NLA_U32}, + [TCA_FQ_PIE_BETA] = {.type = NLA_U32}, + [TCA_FQ_PIE_QUANTUM] = {.type = NLA_U32}, + [TCA_FQ_PIE_MEMORY_LIMIT] = {.type = NLA_U32}, + [TCA_FQ_PIE_ECN_PROB] = {.type = NLA_U32}, + [TCA_FQ_PIE_ECN] = {.type = NLA_U32}, + [TCA_FQ_PIE_BYTEMODE] = {.type = NLA_U32}, + [TCA_FQ_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32}, +}; + +static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow) +{ + struct sk_buff *skb = flow->head; + + flow->head = skb->next; + skb->next = NULL; + return skb; +} + +static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = NULL; + struct fq_pie_flow *flow; + struct list_head *head; + u32 pkt_len; + +begin: + head = &q->new_flows; + if (list_empty(head)) { + head = &q->old_flows; + if (list_empty(head)) + return NULL; + } + + flow = list_first_entry(head, struct fq_pie_flow, flowchain); + /* Flow has exhausted all its credits */ + if (flow->deficit <= 0) { + flow->deficit += q->quantum; + list_move_tail(&flow->flowchain, &q->old_flows); + goto begin; + } + + if (flow->head) { + skb = dequeue_head(flow); + pkt_len = qdisc_pkt_len(skb); + sch->qstats.backlog -= pkt_len; + sch->q.qlen--; + qdisc_bstats_update(sch, skb); + } + + if (!skb) { + /* force a pass through old_flows to prevent starvation */ + if (head == &q->new_flows && !list_empty(&q->old_flows)) + list_move_tail(&flow->flowchain, &q->old_flows); + else + list_del_init(&flow->flowchain); + goto begin; + } + + flow->qlen--; + flow->deficit -= pkt_len; + flow->backlog -= pkt_len; + q->memory_usage -= get_pie_cb(skb)->mem_usage; + pie_process_dequeue(skb, &q->p_params, &flow->vars, flow->backlog); + return skb; +} + +static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_FQ_PIE_MAX + 1]; + unsigned int len_dropped = 0; + unsigned int num_dropped = 0; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack); + if (err < 0) + return err; + + sch_tree_lock(sch); + if (tb[TCA_FQ_PIE_LIMIT]) { + u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]); + + q->p_params.limit = limit; + sch->limit = limit; + } + if (tb[TCA_FQ_PIE_FLOWS]) { + if (q->flows) { + NL_SET_ERR_MSG_MOD(extack, + "Number of flows cannot be changed"); + goto flow_error; + } + q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]); + if (!q->flows_cnt || q->flows_cnt > 65536) { + NL_SET_ERR_MSG_MOD(extack, + "Number of flows must be < 65536"); + goto flow_error; + } + } + + /* convert from microseconds to pschedtime */ + if (tb[TCA_FQ_PIE_TARGET]) { + /* target is in us */ + u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]); + + /* convert to pschedtime */ + q->p_params.target = + PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); + } + + /* tupdate is in jiffies */ + if (tb[TCA_FQ_PIE_TUPDATE]) + q->p_params.tupdate = + usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE])); + + if (tb[TCA_FQ_PIE_ALPHA]) + q->p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]); + + if (tb[TCA_FQ_PIE_BETA]) + q->p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]); + + if (tb[TCA_FQ_PIE_QUANTUM]) + q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]); + + if (tb[TCA_FQ_PIE_MEMORY_LIMIT]) + q->memory_limit = nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]); + + if (tb[TCA_FQ_PIE_ECN_PROB]) + q->ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]); + + if (tb[TCA_FQ_PIE_ECN]) + q->p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]); + + if (tb[TCA_FQ_PIE_BYTEMODE]) + q->p_params.bytemode = nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]); + + if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]) + q->p_params.dq_rate_estimator = + nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]); + + /* Drop excess packets if new limit is lower */ + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = fq_pie_qdisc_dequeue(sch); + + len_dropped += qdisc_pkt_len(skb); + num_dropped += 1; + rtnl_kfree_skbs(skb, skb); + } + qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped); + + sch_tree_unlock(sch); + return 0; + +flow_error: + sch_tree_unlock(sch); + return -EINVAL; +} + +static void fq_pie_timer(struct timer_list *t) +{ + struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer); + struct Qdisc *sch = q->sch; + spinlock_t *root_lock; /* to lock qdisc for probability calculations */ + u16 idx; + + root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + spin_lock(root_lock); + + for (idx = 0; idx < q->flows_cnt; idx++) + pie_calculate_probability(&q->p_params, &q->flows[idx].vars, + q->flows[idx].backlog); + + /* reset the timer to fire after 'tupdate' jiffies. */ + if (q->p_params.tupdate) + mod_timer(&q->adapt_timer, jiffies + q->p_params.tupdate); + + spin_unlock(root_lock); +} + +static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + int err; + u16 idx; + + pie_params_init(&q->p_params); + sch->limit = 10 * 1024; + q->p_params.limit = sch->limit; + q->quantum = psched_mtu(qdisc_dev(sch)); + q->sch = sch; + q->ecn_prob = 10; + q->flows_cnt = 1024; + q->memory_limit = SZ_32M; + + INIT_LIST_HEAD(&q->new_flows); + INIT_LIST_HEAD(&q->old_flows); + + if (opt) { + err = fq_pie_change(sch, opt, extack); + + if (err) + return err; + } + + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); + if (err) + goto init_failure; + + q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_pie_flow), + GFP_KERNEL); + if (!q->flows) { + err = -ENOMEM; + goto init_failure; + } + for (idx = 0; idx < q->flows_cnt; idx++) { + struct fq_pie_flow *flow = q->flows + idx; + + INIT_LIST_HEAD(&flow->flowchain); + pie_vars_init(&flow->vars); + } + + timer_setup(&q->adapt_timer, fq_pie_timer, 0); + mod_timer(&q->adapt_timer, jiffies + HZ / 2); + + return 0; + +init_failure: + q->flows_cnt = 0; + + return err; +} + +static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (!opts) + return -EMSGSIZE; + + /* convert target from pschedtime to us */ + if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) || + nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->flows_cnt) || + nla_put_u32(skb, TCA_FQ_PIE_TARGET, + ((u32)PSCHED_TICKS2NS(q->p_params.target)) / + NSEC_PER_USEC) || + nla_put_u32(skb, TCA_FQ_PIE_TUPDATE, + jiffies_to_usecs(q->p_params.tupdate)) || + nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->p_params.alpha) || + nla_put_u32(skb, TCA_FQ_PIE_BETA, q->p_params.beta) || + nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, q->memory_limit) || + nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->ecn_prob) || + nla_put_u32(skb, TCA_FQ_PIE_ECN, q->p_params.ecn) || + nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, q->p_params.bytemode) || + nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR, + q->p_params.dq_rate_estimator)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct tc_fq_pie_xstats st = { + .packets_in = q->stats.packets_in, + .overlimit = q->stats.overlimit, + .overmemory = q->overmemory, + .dropped = q->stats.dropped, + .ecn_mark = q->stats.ecn_mark, + .new_flow_count = q->new_flow_count, + .memory_usage = q->memory_usage, + }; + struct list_head *pos; + + sch_tree_lock(sch); + list_for_each(pos, &q->new_flows) + st.new_flows_len++; + + list_for_each(pos, &q->old_flows) + st.old_flows_len++; + sch_tree_unlock(sch); + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static void fq_pie_reset(struct Qdisc *sch) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + u16 idx; + + INIT_LIST_HEAD(&q->new_flows); + INIT_LIST_HEAD(&q->old_flows); + for (idx = 0; idx < q->flows_cnt; idx++) { + struct fq_pie_flow *flow = q->flows + idx; + + /* Removes all packets from flow */ + rtnl_kfree_skbs(flow->head, flow->tail); + flow->head = NULL; + + INIT_LIST_HEAD(&flow->flowchain); + pie_vars_init(&flow->vars); + } + + sch->q.qlen = 0; + sch->qstats.backlog = 0; +} + +static void fq_pie_destroy(struct Qdisc *sch) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + + tcf_block_put(q->block); + del_timer_sync(&q->adapt_timer); + kvfree(q->flows); +} + +static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = { + .id = "fq_pie", + .priv_size = sizeof(struct fq_pie_sched_data), + .enqueue = fq_pie_qdisc_enqueue, + .dequeue = fq_pie_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .init = fq_pie_init, + .destroy = fq_pie_destroy, + .reset = fq_pie_reset, + .change = fq_pie_change, + .dump = fq_pie_dump, + .dump_stats = fq_pie_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init fq_pie_module_init(void) +{ + return register_qdisc(&fq_pie_qdisc_ops); +} + +static void __exit fq_pie_module_exit(void) +{ + unregister_qdisc(&fq_pie_qdisc_ops); +} + +module_init(fq_pie_module_init); +module_exit(fq_pie_module_exit); + +MODULE_DESCRIPTION("Flow Queue Proportional Integral controller Enhanced (FQ-PIE)"); +MODULE_AUTHOR("Mohit P. Tahiliani"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 11c03cf4aa74..6c9595f1048a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -46,6 +46,8 @@ EXPORT_SYMBOL(default_qdisc_ops); * - updates to tree and tree walking are only done under the rtnl mutex. */ +#define SKB_XOFF_MAGIC ((struct sk_buff *)1UL) + static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q) { const struct netdev_queue *txq = q->dev_queue; @@ -71,7 +73,7 @@ static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q) q->q.qlen--; } } else { - skb = NULL; + skb = SKB_XOFF_MAGIC; } } @@ -253,8 +255,11 @@ validate: return skb; skb = qdisc_dequeue_skb_bad_txq(q); - if (unlikely(skb)) + if (unlikely(skb)) { + if (skb == SKB_XOFF_MAGIC) + return NULL; goto bulk; + } skb = q->dequeue(q); if (skb) { bulk: @@ -377,13 +382,8 @@ void __qdisc_run(struct Qdisc *q) int packets; while (qdisc_restart(q, &packets)) { - /* - * Ordered by possible occurrence: Postpone processing if - * 1. we've exceeded packet quota - * 2. another process needs the CPU; - */ quota -= packets; - if (quota <= 0 || need_resched()) { + if (quota <= 0) { __netif_schedule(q); break; } @@ -441,7 +441,7 @@ static void dev_watchdog(struct timer_list *t) trace_net_dev_xmit_timeout(dev, i); WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n", dev->name, netdev_drivername(dev), i); - dev->netdev_ops->ndo_tx_timeout(dev); + dev->netdev_ops->ndo_tx_timeout(dev, i); } if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + @@ -624,8 +624,12 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, err = skb_array_produce(q, skb); - if (unlikely(err)) - return qdisc_drop_cpu(skb, qdisc, to_free); + if (unlikely(err)) { + if (qdisc_is_percpu_stats(qdisc)) + return qdisc_drop_cpu(skb, qdisc, to_free); + else + return qdisc_drop(skb, qdisc, to_free); + } qdisc_update_stats_at_enqueue(qdisc, pkt_len); return NET_XMIT_SUCCESS; @@ -648,7 +652,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) if (likely(skb)) { qdisc_update_stats_at_dequeue(qdisc, skb); } else { - qdisc->empty = true; + WRITE_ONCE(qdisc->empty, true); } return skb; @@ -688,11 +692,14 @@ static void pfifo_fast_reset(struct Qdisc *qdisc) kfree_skb(skb); } - for_each_possible_cpu(i) { - struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i); + if (qdisc_is_percpu_stats(qdisc)) { + for_each_possible_cpu(i) { + struct gnet_stats_queue *q; - q->backlog = 0; - q->qlen = 0; + q = per_cpu_ptr(qdisc->cpu_qstats, i); + q->backlog = 0; + q->qlen = 0; + } } } @@ -787,9 +794,6 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { }; EXPORT_SYMBOL(pfifo_fast_ops); -static struct lock_class_key qdisc_tx_busylock; -static struct lock_class_key qdisc_running_key; - struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack) @@ -842,17 +846,9 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, } spin_lock_init(&sch->busylock); - lockdep_set_class(&sch->busylock, - dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); - /* seqlock has the same scope of busylock, for NOLOCK qdisc */ spin_lock_init(&sch->seqlock); - lockdep_set_class(&sch->busylock, - dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); - seqcount_init(&sch->running); - lockdep_set_class(&sch->running, - dev->qdisc_running_key ?: &qdisc_running_key); sch->ops = ops; sch->flags = ops->static_flags; @@ -863,6 +859,12 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, dev_hold(dev); refcount_set(&sch->refcnt, 1); + if (sch != &noop_qdisc) { + lockdep_set_class(&sch->busylock, &dev->qdisc_tx_busylock_key); + lockdep_set_class(&sch->seqlock, &dev->qdisc_tx_busylock_key); + lockdep_set_class(&sch->running, &dev->qdisc_running_key); + } + return sch; errout1: kfree(p); @@ -973,6 +975,9 @@ static void qdisc_destroy(struct Qdisc *qdisc) void qdisc_put(struct Qdisc *qdisc) { + if (!qdisc) + return; + if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_test(&qdisc->refcnt)) return; @@ -1028,6 +1033,8 @@ static void attach_one_default_qdisc(struct net_device *dev, if (dev->priv_flags & IFF_NO_QUEUE) ops = &noqueue_qdisc_ops; + else if(dev->type == ARPHRD_CAN) + ops = &pfifo_fast_ops; qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL); if (!qdisc) { @@ -1202,8 +1209,13 @@ void dev_deactivate_many(struct list_head *head) /* Wait for outstanding qdisc_run calls. */ list_for_each_entry(dev, head, close_list) { - while (some_qdisc_is_busy(dev)) - yield(); + while (some_qdisc_is_busy(dev)) { + /* wait_event() would avoid this sleep-loop but would + * require expensive checks in the fast paths of packet + * processing which isn't worth it. + */ + schedule_timeout_uninterruptible(1); + } /* The new qdisc is assigned at this point so we can safely * unwind stale skb lists and qdisc statistics */ diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index cee6971c1c82..be35f03b657b 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -5,11 +5,11 @@ * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com> */ -#include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> +#include <linux/siphash.h> #include <net/pkt_sched.h> #include <net/sock.h> @@ -126,7 +126,7 @@ struct wdrr_bucket { struct hhf_sched_data { struct wdrr_bucket buckets[WDRR_BUCKET_CNT]; - u32 perturbation; /* hash perturbation */ + siphash_key_t perturbation; /* hash perturbation */ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ u32 drop_overlimit; /* number of times max qdisc packet * limit was hit @@ -264,7 +264,7 @@ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) } /* Get hashed flow-id of the skb. */ - hash = skb_get_hash_perturb(skb, q->perturbation); + hash = skb_get_hash_perturb(skb, &q->perturbation); /* Check if this packet belongs to an already established HH flow. */ flow_pos = hash & HHF_BIT_MASK; @@ -531,7 +531,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt, new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]); non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight; - if (non_hh_quantum > INT_MAX) + if (non_hh_quantum == 0 || non_hh_quantum > INT_MAX) return -EINVAL; sch_tree_lock(sch); @@ -582,7 +582,7 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt, sch->limit = 1000; q->quantum = psched_mtu(qdisc_dev(sch)); - q->perturbation = prandom_u32(); + get_random_bytes(&q->perturbation, sizeof(q->perturbation)); INIT_LIST_HEAD(&q->new_buckets); INIT_LIST_HEAD(&q->old_buckets); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 7bcf20ef9145..8184c87da8be 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1302,6 +1302,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct htb_class *cl = (struct htb_class *)*arg, *parent; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_HTB_MAX + 1]; + struct Qdisc *parent_qdisc = NULL; struct tc_htb_opt *hopt; u64 rate64, ceil64; int warn = 0; @@ -1401,7 +1402,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (parent && !parent->level) { /* turn parent into inner node */ qdisc_purge_queue(parent->leaf.q); - qdisc_put(parent->leaf.q); + parent_qdisc = parent->leaf.q; if (parent->prio_activity) htb_deactivate(q, parent); @@ -1480,6 +1481,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer); sch_tree_unlock(sch); + qdisc_put(parent_qdisc); if (warn) pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n", diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 0d578333e967..e79f1afe0cfd 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -153,6 +153,7 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) __gnet_stats_copy_queue(&sch->qstats, qdisc->cpu_qstats, &qdisc->qstats, qlen); + sch->q.qlen += qlen; } else { sch->q.qlen += qdisc->q.qlen; sch->bstats.bytes += qdisc->bstats.bytes; @@ -245,7 +246,8 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = mq_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(&sch->running, d, sch->cpu_bstats, + &sch->bstats) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 46980b8d66c5..8766ab5b8788 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -411,6 +411,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) __gnet_stats_copy_queue(&sch->qstats, qdisc->cpu_qstats, &qdisc->qstats, qlen); + sch->q.qlen += qlen; } else { sch->q.qlen += qdisc->q.qlen; sch->bstats.bytes += qdisc->bstats.bytes; @@ -433,7 +434,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) opt.offset[tc] = dev->tc_to_txq[tc].offset; } - if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt)) + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; if ((priv->flags & TC_MQPRIO_F_MODE) && @@ -557,8 +558,8 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, + sch->cpu_bstats, &sch->bstats) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; } diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index e1087746f6a2..1330ad224931 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -174,7 +174,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, { struct multiq_sched_data *q = qdisc_priv(sch); struct tc_multiq_qopt *qopt; - int i; + struct Qdisc **removed; + int i, n_removed = 0; if (!netif_is_multiqueue(qdisc_dev(sch))) return -EOPNOTSUPP; @@ -185,6 +186,11 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, qopt->bands = qdisc_dev(sch)->real_num_tx_queues; + removed = kmalloc(sizeof(*removed) * (q->max_bands - q->bands), + GFP_KERNEL); + if (!removed) + return -ENOMEM; + sch_tree_lock(sch); q->bands = qopt->bands; for (i = q->bands; i < q->max_bands; i++) { @@ -192,13 +198,17 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, struct Qdisc *child = q->queues[i]; q->queues[i] = &noop_qdisc; - qdisc_tree_flush_backlog(child); - qdisc_put(child); + qdisc_purge_queue(child); + removed[n_removed++] = child; } } sch_tree_unlock(sch); + for (i = 0; i < n_removed; i++) + qdisc_put(removed[i]); + kfree(removed); + for (i = 0; i < q->bands; i++) { if (q->queues[i] == &noop_qdisc) { struct Qdisc *child, *old; @@ -213,11 +223,10 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, if (child != &noop_qdisc) qdisc_hash_add(child, true); - if (old != &noop_qdisc) { - qdisc_tree_flush_backlog(old); - qdisc_put(old); - } + if (old != &noop_qdisc) + qdisc_purge_queue(old); sch_tree_unlock(sch); + qdisc_put(old); } } } @@ -330,7 +339,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl_q->bstats) < 0 || + d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index b17f2ed970e2..42e557d48e4e 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -476,7 +476,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, * skb will be queued. */ if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { - struct Qdisc *rootq = qdisc_root(sch); + struct Qdisc *rootq = qdisc_root_bh(sch); u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ q->duplicate = 0; @@ -509,6 +509,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) { qdisc_drop(skb, sch, to_free); + skb = NULL; goto finish_segs; } @@ -593,9 +594,10 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, finish_segs: if (segs) { unsigned int len, last_len; - int nb = 0; + int nb; - len = skb->len; + len = skb ? skb->len : 0; + nb = skb ? 1 : 0; while (segs) { skb2 = segs->next; @@ -612,7 +614,10 @@ finish_segs: } segs = skb2; } - qdisc_tree_reduce_backlog(sch, -nb, prev_len - len); + /* Parent qdiscs accounted for 1 skb of size @prev_len */ + qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len)); + } else if (!skb) { + return NET_XMIT_DROP; } return NET_XMIT_SUCCESS; } @@ -777,7 +782,7 @@ static int get_dist_table(struct Qdisc *sch, struct disttable **tbl, struct disttable *d; int i; - if (n > NETEM_DIST_MAX) + if (!n || n > NETEM_DIST_MAX) return -EINVAL; d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL); diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index df98a887eb89..915bcdb59a9f 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -19,134 +19,76 @@ #include <linux/skbuff.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> - -#define QUEUE_THRESHOLD 16384 -#define DQCOUNT_INVALID -1 -#define MAX_PROB 0xffffffffffffffff -#define PIE_SCALE 8 - -/* parameters used */ -struct pie_params { - psched_time_t target; /* user specified target delay in pschedtime */ - u32 tupdate; /* timer frequency (in jiffies) */ - u32 limit; /* number of packets that can be enqueued */ - u32 alpha; /* alpha and beta are between 0 and 32 */ - u32 beta; /* and are used for shift relative to 1 */ - bool ecn; /* true if ecn is enabled */ - bool bytemode; /* to scale drop early prob based on pkt size */ -}; - -/* variables used */ -struct pie_vars { - u64 prob; /* probability but scaled by u64 limit. */ - psched_time_t burst_time; - psched_time_t qdelay; - psched_time_t qdelay_old; - u64 dq_count; /* measured in bytes */ - psched_time_t dq_tstamp; /* drain rate */ - u64 accu_prob; /* accumulated drop probability */ - u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ - u32 qlen_old; /* in bytes */ - u8 accu_prob_overflows; /* overflows of accu_prob */ -}; - -/* statistics gathering */ -struct pie_stats { - u32 packets_in; /* total number of packets enqueued */ - u32 dropped; /* packets dropped due to pie_action */ - u32 overlimit; /* dropped due to lack of space in queue */ - u32 maxq; /* maximum queue size */ - u32 ecn_mark; /* packets marked with ECN */ -}; +#include <net/pie.h> /* private data for the Qdisc */ struct pie_sched_data { - struct pie_params params; struct pie_vars vars; + struct pie_params params; struct pie_stats stats; struct timer_list adapt_timer; struct Qdisc *sch; }; -static void pie_params_init(struct pie_params *params) -{ - params->alpha = 2; - params->beta = 20; - params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */ - params->limit = 1000; /* default of 1000 packets */ - params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */ - params->ecn = false; - params->bytemode = false; -} - -static void pie_vars_init(struct pie_vars *vars) +bool pie_drop_early(struct Qdisc *sch, struct pie_params *params, + struct pie_vars *vars, u32 qlen, u32 packet_size) { - vars->dq_count = DQCOUNT_INVALID; - vars->accu_prob = 0; - vars->avg_dq_rate = 0; - /* default of 150 ms in pschedtime */ - vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC); - vars->accu_prob_overflows = 0; -} - -static bool drop_early(struct Qdisc *sch, u32 packet_size) -{ - struct pie_sched_data *q = qdisc_priv(sch); u64 rnd; - u64 local_prob = q->vars.prob; + u64 local_prob = vars->prob; u32 mtu = psched_mtu(qdisc_dev(sch)); /* If there is still burst allowance left skip random early drop */ - if (q->vars.burst_time > 0) + if (vars->burst_time > 0) return false; /* If current delay is less than half of target, and * if drop prob is low already, disable early_drop */ - if ((q->vars.qdelay < q->params.target / 2) && - (q->vars.prob < MAX_PROB / 5)) + if ((vars->qdelay < params->target / 2) && + (vars->prob < MAX_PROB / 5)) return false; - /* If we have fewer than 2 mtu-sized packets, disable drop_early, + /* If we have fewer than 2 mtu-sized packets, disable pie_drop_early, * similar to min_th in RED */ - if (sch->qstats.backlog < 2 * mtu) + if (qlen < 2 * mtu) return false; /* If bytemode is turned on, use packet size to compute new * probablity. Smaller packets will have lower drop prob in this case */ - if (q->params.bytemode && packet_size <= mtu) + if (params->bytemode && packet_size <= mtu) local_prob = (u64)packet_size * div_u64(local_prob, mtu); else - local_prob = q->vars.prob; + local_prob = vars->prob; if (local_prob == 0) { - q->vars.accu_prob = 0; - q->vars.accu_prob_overflows = 0; + vars->accu_prob = 0; + vars->accu_prob_overflows = 0; } - if (local_prob > MAX_PROB - q->vars.accu_prob) - q->vars.accu_prob_overflows++; + if (local_prob > MAX_PROB - vars->accu_prob) + vars->accu_prob_overflows++; - q->vars.accu_prob += local_prob; + vars->accu_prob += local_prob; - if (q->vars.accu_prob_overflows == 0 && - q->vars.accu_prob < (MAX_PROB / 100) * 85) + if (vars->accu_prob_overflows == 0 && + vars->accu_prob < (MAX_PROB / 100) * 85) return false; - if (q->vars.accu_prob_overflows == 8 && - q->vars.accu_prob >= MAX_PROB / 2) + if (vars->accu_prob_overflows == 8 && + vars->accu_prob >= MAX_PROB / 2) return true; prandom_bytes(&rnd, 8); if (rnd < local_prob) { - q->vars.accu_prob = 0; - q->vars.accu_prob_overflows = 0; + vars->accu_prob = 0; + vars->accu_prob_overflows = 0; return true; } return false; } +EXPORT_SYMBOL_GPL(pie_drop_early); static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) @@ -159,7 +101,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, goto out; } - if (!drop_early(sch, skb->len)) { + if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog, + skb->len)) { enqueue = true; } else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) && INET_ECN_set_ce(skb)) { @@ -172,6 +115,10 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* we can enqueue the packet */ if (enqueue) { + /* Set enqueue time only when dq_rate_estimator is disabled. */ + if (!q->params.dq_rate_estimator) + pie_set_enqueue_time(skb); + q->stats.packets_in++; if (qdisc_qlen(sch) > q->stats.maxq) q->stats.maxq = qdisc_qlen(sch); @@ -187,13 +134,14 @@ out: } static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { - [TCA_PIE_TARGET] = {.type = NLA_U32}, - [TCA_PIE_LIMIT] = {.type = NLA_U32}, - [TCA_PIE_TUPDATE] = {.type = NLA_U32}, - [TCA_PIE_ALPHA] = {.type = NLA_U32}, - [TCA_PIE_BETA] = {.type = NLA_U32}, - [TCA_PIE_ECN] = {.type = NLA_U32}, - [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, + [TCA_PIE_TARGET] = {.type = NLA_U32}, + [TCA_PIE_LIMIT] = {.type = NLA_U32}, + [TCA_PIE_TUPDATE] = {.type = NLA_U32}, + [TCA_PIE_ALPHA] = {.type = NLA_U32}, + [TCA_PIE_BETA] = {.type = NLA_U32}, + [TCA_PIE_ECN] = {.type = NLA_U32}, + [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, + [TCA_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32}, }; static int pie_change(struct Qdisc *sch, struct nlattr *opt, @@ -247,6 +195,10 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_PIE_BYTEMODE]) q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]); + if (tb[TCA_PIE_DQ_RATE_ESTIMATOR]) + q->params.dq_rate_estimator = + nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR]); + /* Drop excess packets if new limit is lower */ qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { @@ -262,48 +214,69 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, return 0; } -static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) +void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, + struct pie_vars *vars, u32 qlen) { - struct pie_sched_data *q = qdisc_priv(sch); - int qlen = sch->qstats.backlog; /* current queue size in bytes */ + psched_time_t now = psched_get_time(); + u32 dtime = 0; + + /* If dq_rate_estimator is disabled, calculate qdelay using the + * packet timestamp. + */ + if (!params->dq_rate_estimator) { + vars->qdelay = now - pie_get_enqueue_time(skb); + + if (vars->dq_tstamp != DTIME_INVALID) + dtime = now - vars->dq_tstamp; + + vars->dq_tstamp = now; + + if (qlen == 0) + vars->qdelay = 0; + + if (dtime == 0) + return; + + goto burst_allowance_reduction; + } /* If current queue is about 10 packets or more and dq_count is unset * we have enough packets to calculate the drain rate. Save * current time as dq_tstamp and start measurement cycle. */ - if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) { - q->vars.dq_tstamp = psched_get_time(); - q->vars.dq_count = 0; + if (qlen >= QUEUE_THRESHOLD && vars->dq_count == DQCOUNT_INVALID) { + vars->dq_tstamp = psched_get_time(); + vars->dq_count = 0; } - /* Calculate the average drain rate from this value. If queue length - * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset + /* Calculate the average drain rate from this value. If queue length + * has receded to a small value viz., <= QUEUE_THRESHOLD bytes, reset * the dq_count to -1 as we don't have enough packets to calculate the - * drain rate anymore The following if block is entered only when we + * drain rate anymore. The following if block is entered only when we * have a substantial queue built up (QUEUE_THRESHOLD bytes or more) * and we calculate the drain rate for the threshold here. dq_count is * in bytes, time difference in psched_time, hence rate is in * bytes/psched_time. */ - if (q->vars.dq_count != DQCOUNT_INVALID) { - q->vars.dq_count += skb->len; + if (vars->dq_count != DQCOUNT_INVALID) { + vars->dq_count += skb->len; + + if (vars->dq_count >= QUEUE_THRESHOLD) { + u32 count = vars->dq_count << PIE_SCALE; - if (q->vars.dq_count >= QUEUE_THRESHOLD) { - psched_time_t now = psched_get_time(); - u32 dtime = now - q->vars.dq_tstamp; - u32 count = q->vars.dq_count << PIE_SCALE; + dtime = now - vars->dq_tstamp; if (dtime == 0) return; count = count / dtime; - if (q->vars.avg_dq_rate == 0) - q->vars.avg_dq_rate = count; + if (vars->avg_dq_rate == 0) + vars->avg_dq_rate = count; else - q->vars.avg_dq_rate = - (q->vars.avg_dq_rate - - (q->vars.avg_dq_rate >> 3)) + (count >> 3); + vars->avg_dq_rate = + (vars->avg_dq_rate - + (vars->avg_dq_rate >> 3)) + (count >> 3); /* If the queue has receded below the threshold, we hold * on to the last drain rate calculated, else we reset @@ -311,43 +284,54 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) * packet is dequeued */ if (qlen < QUEUE_THRESHOLD) { - q->vars.dq_count = DQCOUNT_INVALID; + vars->dq_count = DQCOUNT_INVALID; } else { - q->vars.dq_count = 0; - q->vars.dq_tstamp = psched_get_time(); + vars->dq_count = 0; + vars->dq_tstamp = psched_get_time(); } - if (q->vars.burst_time > 0) { - if (q->vars.burst_time > dtime) - q->vars.burst_time -= dtime; - else - q->vars.burst_time = 0; - } + goto burst_allowance_reduction; } } + + return; + +burst_allowance_reduction: + if (vars->burst_time > 0) { + if (vars->burst_time > dtime) + vars->burst_time -= dtime; + else + vars->burst_time = 0; + } } +EXPORT_SYMBOL_GPL(pie_process_dequeue); -static void calculate_probability(struct Qdisc *sch) +void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, + u32 qlen) { - struct pie_sched_data *q = qdisc_priv(sch); - u32 qlen = sch->qstats.backlog; /* queue size in bytes */ psched_time_t qdelay = 0; /* in pschedtime */ - psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */ + psched_time_t qdelay_old = 0; /* in pschedtime */ s64 delta = 0; /* determines the change in probability */ u64 oldprob; u64 alpha, beta; u32 power; bool update_prob = true; - q->vars.qdelay_old = q->vars.qdelay; + if (params->dq_rate_estimator) { + qdelay_old = vars->qdelay; + vars->qdelay_old = vars->qdelay; - if (q->vars.avg_dq_rate > 0) - qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate; - else - qdelay = 0; + if (vars->avg_dq_rate > 0) + qdelay = (qlen << PIE_SCALE) / vars->avg_dq_rate; + else + qdelay = 0; + } else { + qdelay = vars->qdelay; + qdelay_old = vars->qdelay_old; + } - /* If qdelay is zero and qlen is not, it means qlen is very small, less - * than dequeue_rate, so we do not update probabilty in this round + /* If qdelay is zero and qlen is not, it means qlen is very small, + * so we do not update probabilty in this round. */ if (qdelay == 0 && qlen != 0) update_prob = false; @@ -359,18 +343,18 @@ static void calculate_probability(struct Qdisc *sch) * probability. alpha/beta are updated locally below by scaling down * by 16 to come to 0-2 range. */ - alpha = ((u64)q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; - beta = ((u64)q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + alpha = ((u64)params->alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + beta = ((u64)params->beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; /* We scale alpha and beta differently depending on how heavy the * congestion is. Please see RFC 8033 for details. */ - if (q->vars.prob < MAX_PROB / 10) { + if (vars->prob < MAX_PROB / 10) { alpha >>= 1; beta >>= 1; power = 100; - while (q->vars.prob < div_u64(MAX_PROB, power) && + while (vars->prob < div_u64(MAX_PROB, power) && power <= 1000000) { alpha >>= 2; beta >>= 2; @@ -379,14 +363,14 @@ static void calculate_probability(struct Qdisc *sch) } /* alpha and beta should be between 0 and 32, in multiples of 1/16 */ - delta += alpha * (u64)(qdelay - q->params.target); + delta += alpha * (u64)(qdelay - params->target); delta += beta * (u64)(qdelay - qdelay_old); - oldprob = q->vars.prob; + oldprob = vars->prob; /* to ensure we increase probability in steps of no more than 2% */ if (delta > (s64)(MAX_PROB / (100 / 2)) && - q->vars.prob >= MAX_PROB / 10) + vars->prob >= MAX_PROB / 10) delta = (MAX_PROB / 100) * 2; /* Non-linear drop: @@ -397,12 +381,12 @@ static void calculate_probability(struct Qdisc *sch) if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) delta += MAX_PROB / (100 / 2); - q->vars.prob += delta; + vars->prob += delta; if (delta > 0) { /* prevent overflow */ - if (q->vars.prob < oldprob) { - q->vars.prob = MAX_PROB; + if (vars->prob < oldprob) { + vars->prob = MAX_PROB; /* Prevent normalization error. If probability is at * maximum value already, we normalize it here, and * skip the check to do a non-linear drop in the next @@ -412,8 +396,8 @@ static void calculate_probability(struct Qdisc *sch) } } else { /* prevent underflow */ - if (q->vars.prob > oldprob) - q->vars.prob = 0; + if (vars->prob > oldprob) + vars->prob = 0; } /* Non-linear drop in probability: Reduce drop probability quickly if @@ -422,23 +406,28 @@ static void calculate_probability(struct Qdisc *sch) if (qdelay == 0 && qdelay_old == 0 && update_prob) /* Reduce drop probability to 98.4% */ - q->vars.prob -= q->vars.prob / 64u; + vars->prob -= vars->prob / 64; - q->vars.qdelay = qdelay; - q->vars.qlen_old = qlen; + vars->qdelay = qdelay; + vars->qlen_old = qlen; /* We restart the measurement cycle if the following conditions are met * 1. If the delay has been low for 2 consecutive Tupdate periods * 2. Calculated drop probability is zero - * 3. We have atleast one estimate for the avg_dq_rate ie., - * is a non-zero value + * 3. If average dq_rate_estimator is enabled, we have atleast one + * estimate for the avg_dq_rate ie., is a non-zero value */ - if ((q->vars.qdelay < q->params.target / 2) && - (q->vars.qdelay_old < q->params.target / 2) && - q->vars.prob == 0 && - q->vars.avg_dq_rate > 0) - pie_vars_init(&q->vars); + if ((vars->qdelay < params->target / 2) && + (vars->qdelay_old < params->target / 2) && + vars->prob == 0 && + (!params->dq_rate_estimator || vars->avg_dq_rate > 0)) { + pie_vars_init(vars); + } + + if (!params->dq_rate_estimator) + vars->qdelay_old = qdelay; } +EXPORT_SYMBOL_GPL(pie_calculate_probability); static void pie_timer(struct timer_list *t) { @@ -447,7 +436,7 @@ static void pie_timer(struct timer_list *t) spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); - calculate_probability(sch); + pie_calculate_probability(&q->params, &q->vars, sch->qstats.backlog); /* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */ if (q->params.tupdate) @@ -497,7 +486,9 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || - nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode)) + nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode) || + nla_put_u32(skb, TCA_PIE_DQ_RATE_ESTIMATOR, + q->params.dq_rate_estimator)) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -514,9 +505,6 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .prob = q->vars.prob, .delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) / NSEC_PER_USEC, - /* unscale and return dq_rate in bytes per sec */ - .avg_dq_rate = q->vars.avg_dq_rate * - (PSCHED_TICKS_PER_SEC) >> PIE_SCALE, .packets_in = q->stats.packets_in, .overlimit = q->stats.overlimit, .maxq = q->stats.maxq, @@ -524,17 +512,26 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .ecn_mark = q->stats.ecn_mark, }; + /* avg_dq_rate is only valid if dq_rate_estimator is enabled */ + st.dq_rate_estimating = q->params.dq_rate_estimator; + + /* unscale and return dq_rate in bytes per sec */ + if (q->params.dq_rate_estimator) + st.avg_dq_rate = q->vars.avg_dq_rate * + (PSCHED_TICKS_PER_SEC) >> PIE_SCALE; + return gnet_stats_copy_app(d, &st, sizeof(st)); } static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) { + struct pie_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = qdisc_dequeue_head(sch); if (!skb) return NULL; - pie_process_dequeue(sch, skb); + pie_process_dequeue(skb, &q->params, &q->vars, sch->qstats.backlog); return skb; } @@ -555,7 +552,7 @@ static void pie_destroy(struct Qdisc *sch) } static struct Qdisc_ops pie_qdisc_ops __read_mostly = { - .id = "pie", + .id = "pie", .priv_size = sizeof(struct pie_sched_data), .enqueue = pie_qdisc_enqueue, .dequeue = pie_qdisc_dequeue, diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 0f8fedb8809a..647941702f9f 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -292,8 +292,14 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct tc_prio_qopt_offload graft_offload; unsigned long band = arg - 1; - if (new == NULL) - new = &noop_qdisc; + if (!new) { + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, arg), extack); + if (!new) + new = &noop_qdisc; + else + qdisc_hash_add(new, true); + } *old = qdisc_replace(sch, new, &q->queues[band]); @@ -356,7 +362,7 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl_q->bstats) < 0 || + d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 1dff8506a715..4074c50ac3d7 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -18,7 +18,7 @@ #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/random.h> -#include <linux/jhash.h> +#include <linux/siphash.h> #include <net/ip.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> @@ -45,7 +45,7 @@ struct sfb_bucket { * (Section 4.4 of SFB reference : moving hash functions) */ struct sfb_bins { - u32 perturbation; /* jhash perturbation */ + siphash_key_t perturbation; /* siphash key */ struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS]; }; @@ -217,7 +217,8 @@ static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_da static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q) { - q->bins[slot].perturbation = prandom_u32(); + get_random_bytes(&q->bins[slot].perturbation, + sizeof(q->bins[slot].perturbation)); } static void sfb_swap_slot(struct sfb_sched_data *q) @@ -314,9 +315,9 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* If using external classifiers, get result and record it. */ if (!sfb_classify(skb, fl, &ret, &salt)) goto other_drop; - sfbhash = jhash_1word(salt, q->bins[slot].perturbation); + sfbhash = siphash_1u32(salt, &q->bins[slot].perturbation); } else { - sfbhash = skb_get_hash_perturb(skb, q->bins[slot].perturbation); + sfbhash = skb_get_hash_perturb(skb, &q->bins[slot].perturbation); } @@ -352,7 +353,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* Inelastic flow */ if (q->double_buffering) { sfbhash = skb_get_hash_perturb(skb, - q->bins[slot].perturbation); + &q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; @@ -488,7 +489,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); - struct Qdisc *child; + struct Qdisc *child, *old; struct nlattr *tb[TCA_SFB_MAX + 1]; const struct tc_sfb_qopt *ctl = &sfb_default_ops; u32 limit; @@ -518,8 +519,8 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt, qdisc_hash_add(child, true); sch_tree_lock(sch); - qdisc_tree_flush_backlog(q->qdisc); - qdisc_put(q->qdisc); + qdisc_purge_queue(q->qdisc); + old = q->qdisc; q->qdisc = child; q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval); @@ -542,6 +543,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt, sfb_init_perturbation(1, q); sch_tree_unlock(sch); + qdisc_put(old); return 0; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 68404a9d2ce4..c787d4d46017 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -14,7 +14,7 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/skbuff.h> -#include <linux/jhash.h> +#include <linux/siphash.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/netlink.h> @@ -117,7 +117,7 @@ struct sfq_sched_data { u8 headdrop; u8 maxdepth; /* limit of packets per flow */ - u32 perturbation; + siphash_key_t perturbation; u8 cur_depth; /* depth of longest slot */ u8 flags; unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ @@ -157,7 +157,7 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index static unsigned int sfq_hash(const struct sfq_sched_data *q, const struct sk_buff *skb) { - return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1); + return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1); } static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, @@ -607,9 +607,11 @@ static void sfq_perturbation(struct timer_list *t) struct sfq_sched_data *q = from_timer(q, t, perturb_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + siphash_key_t nkey; + get_random_bytes(&nkey, sizeof(nkey)); spin_lock(root_lock); - q->perturbation = prandom_u32(); + q->perturbation = nkey; if (!q->filter_list && q->tail) sfq_rehash(sch); spin_unlock(root_lock); @@ -688,7 +690,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) del_timer(&q->perturb_timer); if (q->perturb_period) { mod_timer(&q->perturb_timer, jiffies + q->perturb_period); - q->perturbation = prandom_u32(); + get_random_bytes(&q->perturbation, sizeof(q->perturbation)); } sch_tree_unlock(sch); kfree(p); @@ -745,7 +747,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt, q->quantum = psched_mtu(qdisc_dev(sch)); q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); q->perturb_period = 0; - q->perturbation = prandom_u32(); + get_random_bytes(&q->perturbation, sizeof(q->perturbation)); if (opt) { int err = sfq_change(sch, opt); diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index e25d414ae12f..660fc45ee40f 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -29,8 +29,9 @@ static DEFINE_SPINLOCK(taprio_list_lock); #define TAPRIO_ALL_GATES_OPEN -1 -#define FLAGS_VALID(flags) (!((flags) & ~TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)) #define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) +#define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) +#define TAPRIO_FLAGS_INVALID U32_MAX struct sched_entry { struct list_head list; @@ -75,9 +76,16 @@ struct taprio_sched { struct sched_gate_list __rcu *admin_sched; struct hrtimer advance_timer; struct list_head taprio_list; + struct sk_buff *(*dequeue)(struct Qdisc *sch); + struct sk_buff *(*peek)(struct Qdisc *sch); u32 txtime_delay; }; +struct __tc_taprio_qopt_offload { + refcount_t users; + struct tc_taprio_qopt_offload offload; +}; + static ktime_t sched_base_time(const struct sched_gate_list *sched) { if (!sched) @@ -268,6 +276,19 @@ static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) return entry; } +static bool taprio_flags_valid(u32 flags) +{ + /* Make sure no other flag bits are set. */ + if (flags & ~(TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | + TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) + return false; + /* txtime-assist and full offload are mutually exclusive */ + if ((flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) && + (flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) + return false; + return true; +} + /* This returns the tstamp value set by TCP in terms of the set clock. */ static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) { @@ -417,7 +438,7 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, return qdisc_enqueue(skb, child, to_free); } -static struct sk_buff *taprio_peek(struct Qdisc *sch) +static struct sk_buff *taprio_peek_soft(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); @@ -461,6 +482,36 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) return NULL; } +static struct sk_buff *taprio_peek_offload(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct sk_buff *skb; + int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct Qdisc *child = q->qdiscs[i]; + + if (unlikely(!child)) + continue; + + skb = child->ops->peek(child); + if (!skb) + continue; + + return skb; + } + + return NULL; +} + +static struct sk_buff *taprio_peek(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + + return q->peek(sch); +} + static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) { atomic_set(&entry->budget, @@ -468,7 +519,7 @@ static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) atomic64_read(&q->picos_per_byte))); } -static struct sk_buff *taprio_dequeue(struct Qdisc *sch) +static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); @@ -477,11 +528,6 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) u32 gate_mask; int i; - if (atomic64_read(&q->picos_per_byte) == -1) { - WARN_ONCE(1, "taprio: dequeue() called with unknown picos per byte."); - return NULL; - } - rcu_read_lock(); entry = rcu_dereference(q->current_entry); /* if there's no entry, it means that the schedule didn't @@ -555,6 +601,40 @@ done: return skb; } +static struct sk_buff *taprio_dequeue_offload(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct sk_buff *skb; + int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct Qdisc *child = q->qdiscs[i]; + + if (unlikely(!child)) + continue; + + skb = child->ops->dequeue(child); + if (unlikely(!skb)) + continue; + + qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; + + return skb; + } + + return NULL; +} + +static struct sk_buff *taprio_dequeue(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + + return q->dequeue(sch); +} + static bool should_restart_cycle(const struct sched_gate_list *oper, const struct sched_entry *entry) { @@ -677,10 +757,6 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 }, }; -static const struct nla_policy entry_list_policy[TCA_TAPRIO_SCHED_MAX + 1] = { - [TCA_TAPRIO_SCHED_ENTRY] = { .type = NLA_NESTED }, -}; - static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_PRIOMAP] = { .len = sizeof(struct tc_mqprio_qopt) @@ -691,6 +767,7 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, + [TCA_TAPRIO_ATTR_FLAGS] = { .type = NLA_U32 }, }; static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry, @@ -847,7 +924,7 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, } /* Verify priority mapping uses valid tcs */ - for (i = 0; i < TC_BITMASK + 1; i++) { + for (i = 0; i <= TC_BITMASK; i++) { if (qopt->prio_tc_map[i] >= qopt->num_tc) { NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping"); return -EINVAL; @@ -941,6 +1018,9 @@ static void taprio_start_sched(struct Qdisc *sch, struct taprio_sched *q = qdisc_priv(sch); ktime_t expires; + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) + return; + expires = hrtimer_get_expires(&q->advance_timer); if (expires == 0) expires = KTIME_MAX; @@ -958,12 +1038,19 @@ static void taprio_set_picos_per_byte(struct net_device *dev, struct taprio_sched *q) { struct ethtool_link_ksettings ecmd; - int picos_per_byte = -1; + int speed = SPEED_10; + int picos_per_byte; + int err; + + err = __ethtool_get_link_ksettings(dev, &ecmd); + if (err < 0) + goto skip; + + if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN) + speed = ecmd.base.speed; - if (!__ethtool_get_link_ksettings(dev, &ecmd) && - ecmd.base.speed != SPEED_UNKNOWN) - picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8, - ecmd.base.speed * 1000 * 1000); +skip: + picos_per_byte = (USEC_PER_SEC * 8) / speed; atomic64_set(&q->picos_per_byte, picos_per_byte); netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n", @@ -1012,6 +1099,303 @@ static void setup_txtime(struct taprio_sched *q, } } +static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries) +{ + size_t size = sizeof(struct tc_taprio_sched_entry) * num_entries + + sizeof(struct __tc_taprio_qopt_offload); + struct __tc_taprio_qopt_offload *__offload; + + __offload = kzalloc(size, GFP_KERNEL); + if (!__offload) + return NULL; + + refcount_set(&__offload->users, 1); + + return &__offload->offload; +} + +struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload + *offload) +{ + struct __tc_taprio_qopt_offload *__offload; + + __offload = container_of(offload, struct __tc_taprio_qopt_offload, + offload); + + refcount_inc(&__offload->users); + + return offload; +} +EXPORT_SYMBOL_GPL(taprio_offload_get); + +void taprio_offload_free(struct tc_taprio_qopt_offload *offload) +{ + struct __tc_taprio_qopt_offload *__offload; + + __offload = container_of(offload, struct __tc_taprio_qopt_offload, + offload); + + if (!refcount_dec_and_test(&__offload->users)) + return; + + kfree(__offload); +} +EXPORT_SYMBOL_GPL(taprio_offload_free); + +/* The function will only serve to keep the pointers to the "oper" and "admin" + * schedules valid in relation to their base times, so when calling dump() the + * users looks at the right schedules. + * When using full offload, the admin configuration is promoted to oper at the + * base_time in the PHC time domain. But because the system time is not + * necessarily in sync with that, we can't just trigger a hrtimer to call + * switch_schedules at the right hardware time. + * At the moment we call this by hand right away from taprio, but in the future + * it will be useful to create a mechanism for drivers to notify taprio of the + * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump(). + * This is left as TODO. + */ +static void taprio_offload_config_changed(struct taprio_sched *q) +{ + struct sched_gate_list *oper, *admin; + + spin_lock(&q->current_entry_lock); + + oper = rcu_dereference_protected(q->oper_sched, + lockdep_is_held(&q->current_entry_lock)); + admin = rcu_dereference_protected(q->admin_sched, + lockdep_is_held(&q->current_entry_lock)); + + switch_schedules(q, &admin, &oper); + + spin_unlock(&q->current_entry_lock); +} + +static void taprio_sched_to_offload(struct taprio_sched *q, + struct sched_gate_list *sched, + const struct tc_mqprio_qopt *mqprio, + struct tc_taprio_qopt_offload *offload) +{ + struct sched_entry *entry; + int i = 0; + + offload->base_time = sched->base_time; + offload->cycle_time = sched->cycle_time; + offload->cycle_time_extension = sched->cycle_time_extension; + + list_for_each_entry(entry, &sched->entries, list) { + struct tc_taprio_sched_entry *e = &offload->entries[i]; + + e->command = entry->command; + e->interval = entry->interval; + e->gate_mask = entry->gate_mask; + i++; + } + + offload->num_entries = i; +} + +static int taprio_enable_offload(struct net_device *dev, + struct tc_mqprio_qopt *mqprio, + struct taprio_sched *q, + struct sched_gate_list *sched, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct tc_taprio_qopt_offload *offload; + int err = 0; + + if (!ops->ndo_setup_tc) { + NL_SET_ERR_MSG(extack, + "Device does not support taprio offload"); + return -EOPNOTSUPP; + } + + offload = taprio_offload_alloc(sched->num_entries); + if (!offload) { + NL_SET_ERR_MSG(extack, + "Not enough memory for enabling offload mode"); + return -ENOMEM; + } + offload->enable = 1; + taprio_sched_to_offload(q, sched, mqprio, offload); + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); + if (err < 0) { + NL_SET_ERR_MSG(extack, + "Device failed to setup taprio offload"); + goto done; + } + +done: + taprio_offload_free(offload); + + return err; +} + +static int taprio_disable_offload(struct net_device *dev, + struct taprio_sched *q, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct tc_taprio_qopt_offload *offload; + int err; + + if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) + return 0; + + if (!ops->ndo_setup_tc) + return -EOPNOTSUPP; + + offload = taprio_offload_alloc(0); + if (!offload) { + NL_SET_ERR_MSG(extack, + "Not enough memory to disable offload mode"); + return -ENOMEM; + } + offload->enable = 0; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); + if (err < 0) { + NL_SET_ERR_MSG(extack, + "Device failed to disable offload"); + goto out; + } + +out: + taprio_offload_free(offload); + + return err; +} + +/* If full offload is enabled, the only possible clockid is the net device's + * PHC. For that reason, specifying a clockid through netlink is incorrect. + * For txtime-assist, it is implicitly assumed that the device's PHC is kept + * in sync with the specified clockid via a user space daemon such as phc2sys. + * For both software taprio and txtime-assist, the clockid is used for the + * hrtimer that advances the schedule and hence mandatory. + */ +static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int err = -EINVAL; + + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_ts_info info = { + .cmd = ETHTOOL_GET_TS_INFO, + .phc_index = -1, + }; + + if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { + NL_SET_ERR_MSG(extack, + "The 'clockid' cannot be specified for full offload"); + goto out; + } + + if (ops && ops->get_ts_info) + err = ops->get_ts_info(dev, &info); + + if (err || info.phc_index < 0) { + NL_SET_ERR_MSG(extack, + "Device does not have a PTP clock"); + err = -ENOTSUPP; + goto out; + } + } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { + int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); + + /* We only support static clockids and we don't allow + * for it to be modified after the first init. + */ + if (clockid < 0 || + (q->clockid != -1 && q->clockid != clockid)) { + NL_SET_ERR_MSG(extack, + "Changing the 'clockid' of a running schedule is not supported"); + err = -ENOTSUPP; + goto out; + } + + switch (clockid) { + case CLOCK_REALTIME: + q->tk_offset = TK_OFFS_REAL; + break; + case CLOCK_MONOTONIC: + q->tk_offset = TK_OFFS_MAX; + break; + case CLOCK_BOOTTIME: + q->tk_offset = TK_OFFS_BOOT; + break; + case CLOCK_TAI: + q->tk_offset = TK_OFFS_TAI; + break; + default: + NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); + err = -EINVAL; + goto out; + } + + q->clockid = clockid; + } else { + NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory"); + goto out; + } + + /* Everything went ok, return success. */ + err = 0; + +out: + return err; +} + +static int taprio_mqprio_cmp(const struct net_device *dev, + const struct tc_mqprio_qopt *mqprio) +{ + int i; + + if (!mqprio || mqprio->num_tc != dev->num_tc) + return -1; + + for (i = 0; i < mqprio->num_tc; i++) + if (dev->tc_to_txq[i].count != mqprio->count[i] || + dev->tc_to_txq[i].offset != mqprio->offset[i]) + return -1; + + for (i = 0; i <= TC_BITMASK; i++) + if (dev->prio_tc_map[i] != mqprio->prio_tc_map[i]) + return -1; + + return 0; +} + +/* The semantics of the 'flags' argument in relation to 'change()' + * requests, are interpreted following two rules (which are applied in + * this order): (1) an omitted 'flags' argument is interpreted as + * zero; (2) the 'flags' of a "running" taprio instance cannot be + * changed. + */ +static int taprio_new_flags(const struct nlattr *attr, u32 old, + struct netlink_ext_ack *extack) +{ + u32 new = 0; + + if (attr) + new = nla_get_u32(attr); + + if (old != TAPRIO_FLAGS_INVALID && old != new) { + NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); + return -EOPNOTSUPP; + } + + if (!taprio_flags_valid(new)) { + NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid"); + return -EINVAL; + } + + return new; +} + static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -1020,10 +1404,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; - u32 taprio_flags = 0; - int i, err, clockid; unsigned long flags; ktime_t start; + int i, err; err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt, taprio_policy, extack); @@ -1033,21 +1416,14 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); - if (tb[TCA_TAPRIO_ATTR_FLAGS]) { - taprio_flags = nla_get_u32(tb[TCA_TAPRIO_ATTR_FLAGS]); - - if (q->flags != 0 && q->flags != taprio_flags) { - NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); - return -EOPNOTSUPP; - } else if (!FLAGS_VALID(taprio_flags)) { - NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid"); - return -EINVAL; - } + err = taprio_new_flags(tb[TCA_TAPRIO_ATTR_FLAGS], + q->flags, extack); + if (err < 0) + return err; - q->flags = taprio_flags; - } + q->flags = err; - err = taprio_parse_mqprio_opt(dev, mqprio, extack, taprio_flags); + err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags); if (err < 0) return err; @@ -1063,6 +1439,10 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, admin = rcu_dereference(q->admin_sched); rcu_read_unlock(); + /* no changes - no new mqprio settings */ + if (!taprio_mqprio_cmp(dev, mqprio)) + mqprio = NULL; + if (mqprio && (oper || admin)) { NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported"); err = -ENOTSUPP; @@ -1079,29 +1459,31 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, goto free_sched; } - if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { - clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); + err = taprio_parse_clockid(sch, tb, extack); + if (err < 0) + goto free_sched; - /* We only support static clockids and we don't allow - * for it to be modified after the first init. - */ - if (clockid < 0 || - (q->clockid != -1 && q->clockid != clockid)) { - NL_SET_ERR_MSG(extack, "Changing the 'clockid' of a running schedule is not supported"); - err = -ENOTSUPP; - goto free_sched; - } + taprio_set_picos_per_byte(dev, q); - q->clockid = clockid; - } + if (mqprio) { + netdev_set_num_tc(dev, mqprio->num_tc); + for (i = 0; i < mqprio->num_tc; i++) + netdev_set_tc_queue(dev, i, + mqprio->count[i], + mqprio->offset[i]); - if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { - NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory"); - err = -EINVAL; - goto free_sched; + /* Always use supplied priority mappings */ + for (i = 0; i <= TC_BITMASK; i++) + netdev_set_prio_tc_map(dev, i, + mqprio->prio_tc_map[i]); } - taprio_set_picos_per_byte(dev, q); + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) + err = taprio_enable_offload(dev, mqprio, q, new_admin, extack); + else + err = taprio_disable_offload(dev, q, extack); + if (err) + goto free_sched; /* Protects against enqueue()/dequeue() */ spin_lock_bh(qdisc_lock(sch)); @@ -1116,42 +1498,22 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); } - if (!TXTIME_ASSIST_IS_ENABLED(taprio_flags) && + if (!TXTIME_ASSIST_IS_ENABLED(q->flags) && + !FULL_OFFLOAD_IS_ENABLED(q->flags) && !hrtimer_active(&q->advance_timer)) { hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); q->advance_timer.function = advance_sched; } - if (mqprio) { - netdev_set_num_tc(dev, mqprio->num_tc); - for (i = 0; i < mqprio->num_tc; i++) - netdev_set_tc_queue(dev, i, - mqprio->count[i], - mqprio->offset[i]); - - /* Always use supplied priority mappings */ - for (i = 0; i < TC_BITMASK + 1; i++) - netdev_set_prio_tc_map(dev, i, - mqprio->prio_tc_map[i]); - } - - switch (q->clockid) { - case CLOCK_REALTIME: - q->tk_offset = TK_OFFS_REAL; - break; - case CLOCK_MONOTONIC: - q->tk_offset = TK_OFFS_MAX; - break; - case CLOCK_BOOTTIME: - q->tk_offset = TK_OFFS_BOOT; - break; - case CLOCK_TAI: - q->tk_offset = TK_OFFS_TAI; - break; - default: - NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); - err = -EINVAL; - goto unlock; + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { + q->dequeue = taprio_dequeue_offload; + q->peek = taprio_peek_offload; + } else { + /* Be sure to always keep the function pointers + * in a consistent state. + */ + q->dequeue = taprio_dequeue_soft; + q->peek = taprio_peek_soft; } err = taprio_get_start_time(sch, new_admin, &start); @@ -1160,9 +1522,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, goto unlock; } - if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) { - setup_txtime(q, new_admin, start); + setup_txtime(q, new_admin, start); + if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { if (!oper) { rcu_assign_pointer(q->oper_sched, new_admin); err = 0; @@ -1186,6 +1548,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, call_rcu(&admin->rcu, taprio_free_sched_cb); spin_unlock_irqrestore(&q->current_entry_lock, flags); + + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) + taprio_offload_config_changed(q); } new_admin = NULL; @@ -1213,6 +1578,8 @@ static void taprio_destroy(struct Qdisc *sch) hrtimer_cancel(&q->advance_timer); + taprio_disable_offload(dev, q, NULL); + if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++) qdisc_put(q->qdiscs[i]); @@ -1221,7 +1588,7 @@ static void taprio_destroy(struct Qdisc *sch) } q->qdiscs = NULL; - netdev_set_num_tc(dev, 0); + netdev_reset_tc(dev); if (q->oper_sched) call_rcu(&q->oper_sched->rcu, taprio_free_sched_cb); @@ -1242,12 +1609,20 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS); q->advance_timer.function = advance_sched; + q->dequeue = taprio_dequeue_soft; + q->peek = taprio_peek_soft; + q->root = sch; /* We only support static clockids. Use an invalid value as default * and get the valid one on taprio_change(). */ q->clockid = -1; + q->flags = TAPRIO_FLAGS_INVALID; + + spin_lock(&taprio_list_lock); + list_add(&q->taprio_list, &taprio_list); + spin_unlock(&taprio_list_lock); if (sch->parent != TC_H_ROOT) return -EOPNOTSUPP; @@ -1266,10 +1641,6 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, if (!opt) return -EINVAL; - spin_lock(&taprio_list_lock); - list_add(&q->taprio_list, &taprio_list); - spin_unlock(&taprio_list_lock); - for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *dev_queue; struct Qdisc *qdisc; @@ -1424,7 +1795,8 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt)) goto options_error; - if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) + if (!FULL_OFFLOAD_IS_ENABLED(q->flags) && + nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) goto options_error; if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags)) diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 5f72f3f916a5..78e79029dc63 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -15,6 +15,7 @@ #include <linux/skbuff.h> #include <net/netlink.h> #include <net/sch_generic.h> +#include <net/pkt_cls.h> #include <net/pkt_sched.h> @@ -137,6 +138,52 @@ static u64 psched_ns_t2l(const struct psched_ratecfg *r, return len; } +static void tbf_offload_change(struct Qdisc *sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_tbf_qopt_offload qopt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_TBF_REPLACE; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.replace_params.rate = q->rate; + qopt.replace_params.max_size = q->max_size; + qopt.replace_params.qstats = &sch->qstats; + + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt); +} + +static void tbf_offload_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_tbf_qopt_offload qopt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_TBF_DESTROY; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt); +} + +static int tbf_offload_dump(struct Qdisc *sch) +{ + struct tc_tbf_qopt_offload qopt; + + qopt.command = TC_TBF_STATS; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.stats.bstats = &sch->bstats; + qopt.stats.qstats = &sch->qstats; + + return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt); +} + /* GSO packet is too big, segment it so that tbf can transmit * each segment in time */ @@ -155,8 +202,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, return qdisc_drop(skb, sch, to_free); nb = 0; - while (segs) { - nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; len += segs->len; @@ -167,7 +213,6 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, } else { nb++; } - segs = nskb; } sch->q.qlen += nb; if (nb > 1) @@ -409,6 +454,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_unlock(sch); err = 0; + + tbf_offload_change(sch); done: return err; } @@ -434,6 +481,7 @@ static void tbf_destroy(struct Qdisc *sch) struct tbf_sched_data *q = qdisc_priv(sch); qdisc_watchdog_cancel(&q->watchdog); + tbf_offload_destroy(sch); qdisc_put(q->qdisc); } @@ -442,8 +490,12 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) struct tbf_sched_data *q = qdisc_priv(sch); struct nlattr *nest; struct tc_tbf_qopt opt; + int err; + + err = tbf_offload_dump(sch); + if (err) + return err; - sch->qstats.backlog = q->qdisc->qstats.backlog; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 5010cce52c93..437079a4883d 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -54,7 +54,6 @@ static struct sctp_association *sctp_association_init( const struct sock *sk, enum sctp_scope scope, gfp_t gfp) { - struct net *net = sock_net(sk); struct sctp_sock *sp; struct sctp_paramhdr *p; int i; @@ -65,6 +64,7 @@ static struct sctp_association *sctp_association_init( /* Discarding const is appropriate here. */ asoc->ep = (struct sctp_endpoint *)ep; asoc->base.sk = (struct sock *)sk; + asoc->base.net = sock_net(sk); sctp_endpoint_hold(asoc->ep); sock_hold(asoc->base.sk); @@ -87,6 +87,8 @@ static struct sctp_association *sctp_association_init( */ asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt; asoc->pf_retrans = sp->pf_retrans; + asoc->ps_retrans = sp->ps_retrans; + asoc->pf_expose = sp->pf_expose; asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial); asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max); @@ -214,14 +216,6 @@ static struct sctp_association *sctp_association_init( asoc->peer.sack_needed = 1; asoc->peer.sack_generation = 1; - /* Assume that the peer will tell us if he recognizes ASCONF - * as part of INIT exchange. - * The sctp_addip_noauth option is there for backward compatibility - * and will revert old behavior. - */ - if (net->sctp.addip_noauth) - asoc->peer.asconf_capable = 1; - /* Create an input queue. */ sctp_inq_init(&asoc->base.inqueue); sctp_inq_set_th_handler(&asoc->base.inqueue, sctp_assoc_bh_rcv); @@ -333,7 +327,7 @@ void sctp_association_free(struct sctp_association *asoc) * socket. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); } /* Mark as dead, so other users can know this structure is @@ -438,6 +432,8 @@ void sctp_assoc_set_primary(struct sctp_association *asoc, changeover = 1 ; asoc->peer.primary_path = transport; + sctp_ulpevent_nofity_peer_addr_change(transport, + SCTP_ADDR_MADE_PRIM, 0); /* Set a default msg_name for events. */ memcpy(&asoc->peer.primary_addr, &transport->ipaddr, @@ -578,6 +574,7 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc, asoc->peer.transport_count--; + sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_REMOVED, 0); sctp_transport_free(peer); } @@ -587,7 +584,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, const gfp_t gfp, const int peer_state) { - struct net *net = sock_net(asoc->base.sk); struct sctp_transport *peer; struct sctp_sock *sp; unsigned short port; @@ -617,7 +613,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, return peer; } - peer = sctp_transport_new(net, addr, gfp); + peer = sctp_transport_new(asoc->base.net, addr, gfp); if (!peer) return NULL; @@ -633,6 +629,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, /* And the partial failure retrans threshold */ peer->pf_retrans = asoc->pf_retrans; + /* And the primary path switchover retrans threshold */ + peer->ps_retrans = asoc->ps_retrans; /* Initialize the peer's SACK delay timeout based on the * association configured value. @@ -716,6 +714,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list); asoc->peer.transport_count++; + sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_ADDED, 0); + /* If we do not yet have a primary path, set one. */ if (!asoc->peer.primary_path) { sctp_assoc_set_primary(asoc, peer); @@ -790,9 +790,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, enum sctp_transport_cmd command, sctp_sn_error_t error) { - struct sctp_ulpevent *event; - struct sockaddr_storage addr; - int spc_state = 0; + int spc_state = SCTP_ADDR_AVAILABLE; bool ulp_notify = true; /* Record the transition on the transport. */ @@ -802,19 +800,13 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, * to heartbeat success, report the SCTP_ADDR_CONFIRMED * state to the user, otherwise report SCTP_ADDR_AVAILABLE. */ - if (SCTP_UNCONFIRMED == transport->state && - SCTP_HEARTBEAT_SUCCESS == error) - spc_state = SCTP_ADDR_CONFIRMED; - else - spc_state = SCTP_ADDR_AVAILABLE; - /* Don't inform ULP about transition from PF to - * active state and set cwnd to 1 MTU, see SCTP - * Quick failover draft section 5.1, point 5 - */ - if (transport->state == SCTP_PF) { + if (transport->state == SCTP_PF && + asoc->pf_expose != SCTP_PF_EXPOSE_ENABLE) ulp_notify = false; - transport->cwnd = asoc->pathmtu; - } + else if (transport->state == SCTP_UNCONFIRMED && + error == SCTP_HEARTBEAT_SUCCESS) + spc_state = SCTP_ADDR_CONFIRMED; + transport->state = SCTP_ACTIVE; break; @@ -823,19 +815,21 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, * to inactive state. Also, release the cached route since * there may be a better route next time. */ - if (transport->state != SCTP_UNCONFIRMED) + if (transport->state != SCTP_UNCONFIRMED) { transport->state = SCTP_INACTIVE; - else { + spc_state = SCTP_ADDR_UNREACHABLE; + } else { sctp_transport_dst_release(transport); ulp_notify = false; } - - spc_state = SCTP_ADDR_UNREACHABLE; break; case SCTP_TRANSPORT_PF: transport->state = SCTP_PF; - ulp_notify = false; + if (asoc->pf_expose != SCTP_PF_EXPOSE_ENABLE) + ulp_notify = false; + else + spc_state = SCTP_ADDR_POTENTIALLY_FAILED; break; default: @@ -845,16 +839,9 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, /* Generate and send a SCTP_PEER_ADDR_CHANGE notification * to the user. */ - if (ulp_notify) { - memset(&addr, 0, sizeof(struct sockaddr_storage)); - memcpy(&addr, &transport->ipaddr, - transport->af_specific->sockaddr_len); - - event = sctp_ulpevent_make_peer_addr_change(asoc, &addr, - 0, spc_state, error, GFP_ATOMIC); - if (event) - asoc->stream.si->enqueue_event(&asoc->ulpq, event); - } + if (ulp_notify) + sctp_ulpevent_nofity_peer_addr_change(transport, + spc_state, error); /* Select new active and retran paths. */ sctp_select_active_and_retran_path(asoc); @@ -986,7 +973,7 @@ static void sctp_assoc_bh_rcv(struct work_struct *work) struct sctp_association *asoc = container_of(work, struct sctp_association, base.inqueue.immediate); - struct net *net = sock_net(asoc->base.sk); + struct net *net = asoc->base.net; union sctp_subtype subtype; struct sctp_endpoint *ep; struct sctp_chunk *chunk; @@ -1086,7 +1073,7 @@ void sctp_assoc_migrate(struct sctp_association *assoc, struct sock *newsk) /* Decrement the backlog value for a TCP-style socket. */ if (sctp_style(oldsk, TCP)) - oldsk->sk_ack_backlog--; + sk_acceptq_removed(oldsk); /* Release references to the old endpoint and the sock. */ sctp_endpoint_put(assoc->ep); @@ -1454,7 +1441,8 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc) /* Should we send a SACK to update our peer? */ static inline bool sctp_peer_needs_update(struct sctp_association *asoc) { - struct net *net = sock_net(asoc->base.sk); + struct net *net = asoc->base.net; + switch (asoc->state) { case SCTP_STATE_ESTABLISHED: case SCTP_STATE_SHUTDOWN_PENDING: @@ -1588,7 +1576,7 @@ int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, if (asoc->peer.ipv6_address) flags |= SCTP_ADDR6_PEERSUPP; - return sctp_bind_addr_copy(sock_net(asoc->base.sk), + return sctp_bind_addr_copy(asoc->base.net, &asoc->base.bind_addr, &asoc->ep->base.bind_addr, scope, gfp, flags); diff --git a/net/sctp/auth.c b/net/sctp/auth.c index de4c78d4a21e..4278764d82b8 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -389,7 +389,7 @@ int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp) /* If we don't support AUTH, or peer is not capable * we don't need to do anything. */ - if (!asoc->ep->auth_enable || !asoc->peer.auth_capable) + if (!asoc->peer.auth_capable) return 0; /* If the key_id is non-zero and we couldn't find an @@ -675,7 +675,7 @@ int sctp_auth_send_cid(enum sctp_cid chunk, const struct sctp_association *asoc) if (!asoc) return 0; - if (!asoc->ep->auth_enable || !asoc->peer.auth_capable) + if (!asoc->peer.auth_capable) return 0; return __sctp_auth_cid(chunk, asoc->peer.peer_chunks); @@ -687,7 +687,7 @@ int sctp_auth_recv_cid(enum sctp_cid chunk, const struct sctp_association *asoc) if (!asoc) return 0; - if (!asoc->ep->auth_enable) + if (!asoc->peer.auth_capable) return 0; return __sctp_auth_cid(chunk, @@ -831,10 +831,15 @@ int sctp_auth_set_key(struct sctp_endpoint *ep, /* Try to find the given key id to see if * we are doing a replace, or adding a new key */ - if (asoc) + if (asoc) { + if (!asoc->peer.auth_capable) + return -EACCES; sh_keys = &asoc->endpoint_shared_keys; - else + } else { + if (!ep->auth_enable) + return -EACCES; sh_keys = &ep->endpoint_shared_keys; + } key_for_each(shkey, sh_keys) { if (shkey->key_id == auth_key->sca_keynumber) { @@ -875,10 +880,15 @@ int sctp_auth_set_active_key(struct sctp_endpoint *ep, int found = 0; /* The key identifier MUST correst to an existing key */ - if (asoc) + if (asoc) { + if (!asoc->peer.auth_capable) + return -EACCES; sh_keys = &asoc->endpoint_shared_keys; - else + } else { + if (!ep->auth_enable) + return -EACCES; sh_keys = &ep->endpoint_shared_keys; + } key_for_each(key, sh_keys) { if (key->key_id == key_id) { @@ -911,11 +921,15 @@ int sctp_auth_del_key_id(struct sctp_endpoint *ep, * The key identifier MUST correst to an existing key */ if (asoc) { + if (!asoc->peer.auth_capable) + return -EACCES; if (asoc->active_key_id == key_id) return -EINVAL; sh_keys = &asoc->endpoint_shared_keys; } else { + if (!ep->auth_enable) + return -EACCES; if (ep->active_key_id == key_id) return -EINVAL; @@ -950,11 +964,15 @@ int sctp_auth_deact_key_id(struct sctp_endpoint *ep, * The key identifier MUST correst to an existing key */ if (asoc) { + if (!asoc->peer.auth_capable) + return -EACCES; if (asoc->active_key_id == key_id) return -EINVAL; sh_keys = &asoc->endpoint_shared_keys; } else { + if (!ep->auth_enable) + return -EACCES; if (ep->active_key_id == key_id) return -EINVAL; @@ -989,3 +1007,72 @@ int sctp_auth_deact_key_id(struct sctp_endpoint *ep, return 0; } + +int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp) +{ + int err = -ENOMEM; + + /* Allocate space for HMACS and CHUNKS authentication + * variables. There are arrays that we encode directly + * into parameters to make the rest of the operations easier. + */ + if (!ep->auth_hmacs_list) { + struct sctp_hmac_algo_param *auth_hmacs; + + auth_hmacs = kzalloc(struct_size(auth_hmacs, hmac_ids, + SCTP_AUTH_NUM_HMACS), gfp); + if (!auth_hmacs) + goto nomem; + /* Initialize the HMACS parameter. + * SCTP-AUTH: Section 3.3 + * Every endpoint supporting SCTP chunk authentication MUST + * support the HMAC based on the SHA-1 algorithm. + */ + auth_hmacs->param_hdr.type = SCTP_PARAM_HMAC_ALGO; + auth_hmacs->param_hdr.length = + htons(sizeof(struct sctp_paramhdr) + 2); + auth_hmacs->hmac_ids[0] = htons(SCTP_AUTH_HMAC_ID_SHA1); + ep->auth_hmacs_list = auth_hmacs; + } + + if (!ep->auth_chunk_list) { + struct sctp_chunks_param *auth_chunks; + + auth_chunks = kzalloc(sizeof(*auth_chunks) + + SCTP_NUM_CHUNK_TYPES, gfp); + if (!auth_chunks) + goto nomem; + /* Initialize the CHUNKS parameter */ + auth_chunks->param_hdr.type = SCTP_PARAM_CHUNKS; + auth_chunks->param_hdr.length = + htons(sizeof(struct sctp_paramhdr)); + ep->auth_chunk_list = auth_chunks; + } + + /* Allocate and initialize transorms arrays for supported + * HMACs. + */ + err = sctp_auth_init_hmacs(ep, gfp); + if (err) + goto nomem; + + return 0; + +nomem: + /* Free all allocations */ + kfree(ep->auth_hmacs_list); + kfree(ep->auth_chunk_list); + ep->auth_hmacs_list = NULL; + ep->auth_chunk_list = NULL; + return err; +} + +void sctp_auth_free(struct sctp_endpoint *ep) +{ + kfree(ep->auth_hmacs_list); + kfree(ep->auth_chunk_list); + ep->auth_hmacs_list = NULL; + ep->auth_chunk_list = NULL; + sctp_auth_destroy_hmacs(ep->auth_hmacs); + ep->auth_hmacs = NULL; +} diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index cc0405c79dfc..ab6a997e222f 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -75,41 +75,39 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_ulpevent *ev; - int error = 0, notify; - - /* If we failed, we may need to notify. */ - notify = msg->send_failed ? -1 : 0; + int error, sent; /* Release all references. */ list_for_each_safe(pos, temp, &msg->chunks) { list_del_init(pos); chunk = list_entry(pos, struct sctp_chunk, frag_list); - /* Check whether we _really_ need to notify. */ - if (notify < 0) { - asoc = chunk->asoc; - if (msg->send_error) - error = msg->send_error; - else - error = asoc->outqueue.error; - - notify = sctp_ulpevent_type_enabled(asoc->subscribe, - SCTP_SEND_FAILED); + + if (!msg->send_failed) { + sctp_chunk_put(chunk); + continue; } - /* Generate a SEND FAILED event only if enabled. */ - if (notify > 0) { - int sent; - if (chunk->has_tsn) - sent = SCTP_DATA_SENT; - else - sent = SCTP_DATA_UNSENT; + asoc = chunk->asoc; + error = msg->send_error ?: asoc->outqueue.error; + sent = chunk->has_tsn ? SCTP_DATA_SENT : SCTP_DATA_UNSENT; + if (sctp_ulpevent_type_enabled(asoc->subscribe, + SCTP_SEND_FAILED)) { ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } + if (sctp_ulpevent_type_enabled(asoc->subscribe, + SCTP_SEND_FAILED_EVENT)) { + ev = sctp_ulpevent_make_send_failed_event(asoc, chunk, + sent, error, + GFP_ATOMIC); + if (ev) + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); + } + sctp_chunk_put(chunk); } @@ -227,7 +225,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, if (msg_len >= first_len) { msg->can_delay = 0; if (msg_len > first_len) - SCTP_INC_STATS(sock_net(asoc->base.sk), + SCTP_INC_STATS(asoc->base.net, SCTP_MIB_FRAGUSRMSGS); } else { /* Which may be the only one... */ diff --git a/net/sctp/diag.c b/net/sctp/diag.c index fc9a4c6629ce..8a15146faaeb 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -175,7 +175,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); - mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0) @@ -425,8 +425,8 @@ static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, r->idiag_rqueue = atomic_read(&infox->asoc->rmem_alloc); r->idiag_wqueue = infox->asoc->sndbuf_used; } else { - r->idiag_rqueue = sk->sk_ack_backlog; - r->idiag_wqueue = sk->sk_max_ack_backlog; + r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog); + r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog); } if (infox->sctpinfo) sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo); diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 69cebb2c998b..48c9c2c7602f 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -43,62 +43,21 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, gfp_t gfp) { struct net *net = sock_net(sk); - struct sctp_hmac_algo_param *auth_hmacs = NULL; - struct sctp_chunks_param *auth_chunks = NULL; struct sctp_shared_key *null_key; - int err; ep->digest = kzalloc(SCTP_SIGNATURE_SIZE, gfp); if (!ep->digest) return NULL; + ep->asconf_enable = net->sctp.addip_enable; ep->auth_enable = net->sctp.auth_enable; if (ep->auth_enable) { - /* Allocate space for HMACS and CHUNKS authentication - * variables. There are arrays that we encode directly - * into parameters to make the rest of the operations easier. - */ - auth_hmacs = kzalloc(struct_size(auth_hmacs, hmac_ids, - SCTP_AUTH_NUM_HMACS), gfp); - if (!auth_hmacs) - goto nomem; - - auth_chunks = kzalloc(sizeof(*auth_chunks) + - SCTP_NUM_CHUNK_TYPES, gfp); - if (!auth_chunks) + if (sctp_auth_init(ep, gfp)) goto nomem; - - /* Initialize the HMACS parameter. - * SCTP-AUTH: Section 3.3 - * Every endpoint supporting SCTP chunk authentication MUST - * support the HMAC based on the SHA-1 algorithm. - */ - auth_hmacs->param_hdr.type = SCTP_PARAM_HMAC_ALGO; - auth_hmacs->param_hdr.length = - htons(sizeof(struct sctp_paramhdr) + 2); - auth_hmacs->hmac_ids[0] = htons(SCTP_AUTH_HMAC_ID_SHA1); - - /* Initialize the CHUNKS parameter */ - auth_chunks->param_hdr.type = SCTP_PARAM_CHUNKS; - auth_chunks->param_hdr.length = - htons(sizeof(struct sctp_paramhdr)); - - /* If the Add-IP functionality is enabled, we must - * authenticate, ASCONF and ASCONF-ACK chunks - */ - if (net->sctp.addip_enable) { - auth_chunks->chunks[0] = SCTP_CID_ASCONF; - auth_chunks->chunks[1] = SCTP_CID_ASCONF_ACK; - auth_chunks->param_hdr.length = - htons(sizeof(struct sctp_paramhdr) + 2); + if (ep->asconf_enable) { + sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF); + sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF_ACK); } - - /* Allocate and initialize transorms arrays for supported - * HMACs. - */ - err = sctp_auth_init_hmacs(ep, gfp); - if (err) - goto nomem; } /* Initialize the base structure. */ @@ -145,23 +104,20 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, /* Add the null key to the endpoint shared keys list and * set the hmcas and chunks pointers. */ - ep->auth_hmacs_list = auth_hmacs; - ep->auth_chunk_list = auth_chunks; ep->prsctp_enable = net->sctp.prsctp_enable; ep->reconf_enable = net->sctp.reconf_enable; + ep->ecn_enable = net->sctp.ecn_enable; /* Remember who we are attached to. */ ep->base.sk = sk; + ep->base.net = sock_net(sk); sock_hold(ep->base.sk); return ep; nomem_shkey: - sctp_auth_destroy_hmacs(ep->auth_hmacs); + sctp_auth_free(ep); nomem: - /* Free all allocations */ - kfree(auth_hmacs); - kfree(auth_chunks); kfree(ep->digest); return NULL; @@ -209,7 +165,7 @@ void sctp_endpoint_add_asoc(struct sctp_endpoint *ep, /* Increment the backlog value for a TCP-style listening socket. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) - sk->sk_ack_backlog++; + sk_acceptq_added(sk); } /* Free the endpoint structure. Delay cleanup until @@ -244,11 +200,7 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) * chunks and hmacs arrays that were allocated */ sctp_auth_destroy_keys(&ep->endpoint_shared_keys); - kfree(ep->auth_hmacs_list); - kfree(ep->auth_chunk_list); - - /* AUTH - Free any allocated HMAC transform containers */ - sctp_auth_destroy_hmacs(ep->auth_hmacs); + sctp_auth_free(ep); /* Cleanup. */ sctp_inq_free(&ep->base.inqueue); @@ -292,7 +244,7 @@ struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep, struct sctp_endpoint *retval = NULL; if ((htons(ep->base.bind_addr.port) == laddr->v4.sin_port) && - net_eq(sock_net(ep->base.sk), net)) { + net_eq(ep->base.net, net)) { if (sctp_bind_addr_match(&ep->base.bind_addr, laddr, sctp_sk(ep->base.sk))) retval = ep; @@ -340,8 +292,8 @@ bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, const union sctp_addr *paddr) { struct sctp_sockaddr_entry *addr; + struct net *net = ep->base.net; struct sctp_bind_addr *bp; - struct net *net = sock_net(ep->base.sk); bp = &ep->base.bind_addr; /* This function is called with the socket lock held, @@ -432,7 +384,7 @@ normal: if (asoc && sctp_chunk_is_data(chunk)) asoc->peer.last_data_from = chunk->transport; else { - SCTP_INC_STATS(sock_net(ep->base.sk), SCTP_MIB_INCTRLCHUNKS); + SCTP_INC_STATS(ep->base.net, SCTP_MIB_INCTRLCHUNKS); if (asoc) asoc->stats.ictrlchunks++; } diff --git a/net/sctp/input.c b/net/sctp/input.c index 1008cdc44dd6..efaaefc3bb1c 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -201,7 +201,7 @@ int sctp_rcv(struct sk_buff *skb) if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family)) goto discard_release; - nf_reset(skb); + nf_reset_ct(skb); if (sk_filter(sk, skb)) goto discard_release; @@ -243,7 +243,7 @@ int sctp_rcv(struct sk_buff *skb) bh_lock_sock(sk); } - if (sock_owned_by_user(sk)) { + if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { if (sctp_add_backlog(sk, skb)) { bh_unlock_sock(sk); sctp_chunk_free(chunk); @@ -321,8 +321,8 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) local_bh_disable(); bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) + if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { + if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) sctp_chunk_free(chunk); else backloged = 1; @@ -336,7 +336,13 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) if (backloged) return 0; } else { - sctp_inq_push(inqueue, chunk); + if (!sctp_newsk_ready(sk)) { + if (!sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) + return 0; + sctp_chunk_free(chunk); + } else { + sctp_inq_push(inqueue, chunk); + } } done: @@ -358,7 +364,7 @@ static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb) struct sctp_ep_common *rcvr = chunk->rcvr; int ret; - ret = sk_add_backlog(sk, skb, sk->sk_rcvbuf); + ret = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)); if (!ret) { /* Hold the assoc/ep while hanging on the backlog queue. * This way, we know structures we need will not disappear @@ -876,7 +882,7 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, if (!sctp_transport_hold(t)) return err; - if (!net_eq(sock_net(t->asoc->base.sk), x->net)) + if (!net_eq(t->asoc->base.net, x->net)) goto out; if (x->lport != htons(t->asoc->base.bind_addr.port)) goto out; @@ -891,7 +897,7 @@ static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed) { const struct sctp_transport *t = data; - return sctp_hashfn(sock_net(t->asoc->base.sk), + return sctp_hashfn(t->asoc->base.net, htons(t->asoc->base.bind_addr.port), &t->ipaddr, seed); } @@ -931,7 +937,7 @@ int sctp_hash_transport(struct sctp_transport *t) if (t->asoc->temp) return 0; - arg.net = sock_net(t->asoc->base.sk); + arg.net = t->asoc->base.net; arg.paddr = &t->ipaddr; arg.lport = htons(t->asoc->base.bind_addr.port); @@ -998,12 +1004,11 @@ struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr) { - struct net *net = sock_net(ep->base.sk); struct rhlist_head *tmp, *list; struct sctp_transport *t; struct sctp_hash_cmp_arg arg = { .paddr = paddr, - .net = net, + .net = ep->base.net, .lport = htons(ep->base.bind_addr.port), }; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e5f2fc726a98..bc734cfaa29e 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -215,7 +215,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) rcu_read_lock(); res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), - tclass); + tclass, sk->sk_priority); rcu_read_unlock(); return res; } @@ -275,7 +275,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); - dst = ip6_dst_lookup_flow(sk, fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!asoc || saddr) goto out; @@ -328,7 +328,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl6->saddr = laddr->a.v6.sin6_addr; fl6->fl6_sport = laddr->a.v6.sin6_port; final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); - bdst = ip6_dst_lookup_flow(sk, fl6, final_p); + bdst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(bdst)) continue; diff --git a/net/sctp/output.c b/net/sctp/output.c index dbda7e7927fd..1441eaf460bb 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -282,7 +282,7 @@ static enum sctp_xmit sctp_packet_bundle_sack(struct sctp_packet *pkt, sctp_chunk_free(sack); goto out; } - SCTP_INC_STATS(sock_net(asoc->base.sk), + SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); asoc->stats.octrlchunks++; asoc->peer.sack_needed = 0; diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 0dab62b67b9a..577e3bc4ee6f 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -36,6 +36,7 @@ #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> +#include <trace/events/sctp.h> /* Declare internal functions here. */ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn); @@ -279,7 +280,7 @@ void sctp_outq_free(struct sctp_outq *q) /* Put a new chunk in an sctp_outq. */ void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) { - struct net *net = sock_net(q->asoc->base.sk); + struct net *net = q->asoc->base.net; pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk, chunk && chunk->chunk_hdr ? @@ -533,7 +534,7 @@ void sctp_retransmit_mark(struct sctp_outq *q, void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, enum sctp_retransmit_reason reason) { - struct net *net = sock_net(q->asoc->base.sk); + struct net *net = q->asoc->base.net; switch (reason) { case SCTP_RTXR_T3_RTX: @@ -1238,6 +1239,12 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) /* Grab the association's destination address list. */ transport_list = &asoc->peer.transport_addr_list; + /* SCTP path tracepoint for congestion control debugging. */ + if (trace_sctp_probe_path_enabled()) { + list_for_each_entry(transport, transport_list, transports) + trace_sctp_probe_path(transport, asoc); + } + sack_ctsn = ntohl(sack->cum_tsn_ack); gap_ack_blocks = ntohs(sack->num_gap_ack_blocks); asoc->stats.gapcnt += gap_ack_blocks; @@ -1884,6 +1891,6 @@ void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) if (ftsn_chunk) { list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); - SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_OUTCTRLCHUNKS); + SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); } } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 2d47adcb4cbe..78af2fcf90cc 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -227,6 +227,7 @@ static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb, sa->sin_port = sh->dest; sa->sin_addr.s_addr = ip_hdr(skb)->daddr; } + memset(sa->sin_zero, 0, sizeof(sa->sin_zero)); } /* Initialize an sctp_addr from a socket. */ @@ -235,6 +236,7 @@ static void sctp_v4_from_sk(union sctp_addr *addr, struct sock *sk) addr->v4.sin_family = AF_INET; addr->v4.sin_port = 0; addr->v4.sin_addr.s_addr = inet_sk(sk)->inet_rcv_saddr; + memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); } /* Initialize sk->sk_rcv_saddr from sctp_addr. */ @@ -257,6 +259,7 @@ static void sctp_v4_from_addr_param(union sctp_addr *addr, addr->v4.sin_family = AF_INET; addr->v4.sin_port = port; addr->v4.sin_addr.s_addr = param->v4.addr.s_addr; + memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); } /* Initialize an address parameter from a sctp_addr and return the length @@ -281,6 +284,7 @@ static void sctp_v4_dst_saddr(union sctp_addr *saddr, struct flowi4 *fl4, saddr->v4.sin_family = AF_INET; saddr->v4.sin_port = port; saddr->v4.sin_addr.s_addr = fl4->saddr; + memset(saddr->v4.sin_zero, 0, sizeof(saddr->v4.sin_zero)); } /* Compare two addresses exactly. */ @@ -303,6 +307,7 @@ static void sctp_v4_inaddr_any(union sctp_addr *addr, __be16 port) addr->v4.sin_family = AF_INET; addr->v4.sin_addr.s_addr = htonl(INADDR_ANY); addr->v4.sin_port = port; + memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); } /* Is this a wildcard address? */ @@ -1217,9 +1222,15 @@ static int __net_init sctp_defaults_init(struct net *net) /* Max.Burst - 4 */ net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST; + /* Disable of Primary Path Switchover by default */ + net->sctp.ps_retrans = SCTP_PS_RETRANS_MAX; + /* Enable pf state by default */ net->sctp.pf_enable = 1; + /* Ignore pf exposure feature by default */ + net->sctp.pf_expose = SCTP_PF_EXPOSE_UNSET; + /* Association.Max.Retrans - 10 attempts * Path.Max.Retrans - 5 attempts (per destination address) * Max.Init.Retransmits - 8 attempts @@ -1254,6 +1265,9 @@ static int __net_init sctp_defaults_init(struct net *net) /* Disable AUTH by default. */ net->sctp.auth_enable = 0; + /* Enable ECN by default. */ + net->sctp.ecn_enable = 1; + /* Set SCOPE policy to enabled */ net->sctp.scope_policy = SCTP_SCOPE_POLICY_ENABLE; @@ -1336,7 +1350,7 @@ static int __net_init sctp_ctrlsock_init(struct net *net) return status; } -static void __net_init sctp_ctrlsock_exit(struct net *net) +static void __net_exit sctp_ctrlsock_exit(struct net *net) { /* Free the control endpoint. */ inet_ctl_sock_destroy(net->sctp.ctl_sock); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 36bd8a6e82df..09050c1d5517 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -207,7 +207,6 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, const struct sctp_bind_addr *bp, gfp_t gfp, int vparam_len) { - struct net *net = sock_net(asoc->base.sk); struct sctp_supported_ext_param ext_param; struct sctp_adaptation_ind_param aiparam; struct sctp_paramhdr *auth_chunks = NULL; @@ -245,7 +244,9 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, chunksize = sizeof(init) + addrs_len; chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types)); - chunksize += sizeof(ecap_param); + + if (asoc->ep->ecn_enable) + chunksize += sizeof(ecap_param); if (asoc->ep->prsctp_enable) chunksize += sizeof(prsctp_param); @@ -255,7 +256,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, * the ASCONF,the ASCONF-ACK, and the AUTH chunks in its INIT and * INIT-ACK parameters. */ - if (net->sctp.addip_enable) { + if (asoc->ep->asconf_enable) { extensions[num_ext] = SCTP_CID_ASCONF; extensions[num_ext+1] = SCTP_CID_ASCONF_ACK; num_ext += 2; @@ -336,7 +337,8 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, sctp_addto_chunk(retval, sizeof(sat), &sat); sctp_addto_chunk(retval, num_types * sizeof(__u16), &types); - sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); + if (asoc->ep->ecn_enable) + sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); /* Add the supported extensions parameter. Be nice and add this * fist before addiding the parameters for the extensions themselves @@ -1964,7 +1966,9 @@ static int sctp_process_hn_param(const struct sctp_association *asoc, return 0; } -static int sctp_verify_ext_param(struct net *net, union sctp_params param) +static int sctp_verify_ext_param(struct net *net, + const struct sctp_endpoint *ep, + union sctp_params param) { __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); int have_asconf = 0; @@ -1991,7 +1995,7 @@ static int sctp_verify_ext_param(struct net *net, union sctp_params param) if (net->sctp.addip_noauth) return 1; - if (net->sctp.addip_enable && !have_auth && have_asconf) + if (ep->asconf_enable && !have_auth && have_asconf) return 0; return 1; @@ -2001,7 +2005,6 @@ static void sctp_process_ext_param(struct sctp_association *asoc, union sctp_params param) { __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); - struct net *net = sock_net(asoc->base.sk); int i; for (i = 0; i < num_ext; i++) { @@ -2023,7 +2026,7 @@ static void sctp_process_ext_param(struct sctp_association *asoc, break; case SCTP_CID_ASCONF: case SCTP_CID_ASCONF_ACK: - if (net->sctp.addip_enable) + if (asoc->ep->asconf_enable) asoc->peer.asconf_capable = 1; break; case SCTP_CID_I_DATA: @@ -2145,14 +2148,14 @@ static enum sctp_ierror sctp_verify_param(struct net *net, break; case SCTP_PARAM_SUPPORTED_EXT: - if (!sctp_verify_ext_param(net, param)) + if (!sctp_verify_ext_param(net, ep, param)) return SCTP_IERROR_ABORT; break; case SCTP_PARAM_SET_PRIMARY: - if (net->sctp.addip_enable) + if (ep->asconf_enable) break; - goto fallthrough; + goto unhandled; case SCTP_PARAM_HOST_NAME_ADDRESS: /* Tell the peer, we won't support this param. */ @@ -2163,11 +2166,11 @@ static enum sctp_ierror sctp_verify_param(struct net *net, case SCTP_PARAM_FWD_TSN_SUPPORT: if (ep->prsctp_enable) break; - goto fallthrough; + goto unhandled; case SCTP_PARAM_RANDOM: if (!ep->auth_enable) - goto fallthrough; + goto unhandled; /* SCTP-AUTH: Secion 6.1 * If the random number is not 32 byte long the association @@ -2184,7 +2187,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net, case SCTP_PARAM_CHUNKS: if (!ep->auth_enable) - goto fallthrough; + goto unhandled; /* SCTP-AUTH: Section 3.2 * The CHUNKS parameter MUST be included once in the INIT or @@ -2200,7 +2203,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net, case SCTP_PARAM_HMAC_ALGO: if (!ep->auth_enable) - goto fallthrough; + goto unhandled; hmacs = (struct sctp_hmac_algo_param *)param.p; n_elt = (ntohs(param.p->length) - @@ -2223,7 +2226,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net, retval = SCTP_IERROR_ABORT; } break; -fallthrough: +unhandled: default: pr_debug("%s: unrecognized param:%d for chunk:%d\n", __func__, ntohs(param.p->type), cid); @@ -2304,7 +2307,6 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, const union sctp_addr *peer_addr, struct sctp_init_chunk *peer_init, gfp_t gfp) { - struct net *net = sock_net(asoc->base.sk); struct sctp_transport *transport; struct list_head *pos, *temp; union sctp_params param; @@ -2360,8 +2362,8 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, * also give us an option to silently ignore the packet, which * is what we'll do here. */ - if (!net->sctp.addip_noauth && - (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) { + if (!asoc->base.net->sctp.addip_noauth && + (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) { asoc->peer.addip_disabled_mask |= (SCTP_PARAM_ADD_IP | SCTP_PARAM_DEL_IP | SCTP_PARAM_SET_PRIMARY); @@ -2488,9 +2490,9 @@ static int sctp_process_param(struct sctp_association *asoc, const union sctp_addr *peer_addr, gfp_t gfp) { - struct net *net = sock_net(asoc->base.sk); struct sctp_endpoint *ep = asoc->ep; union sctp_addr_param *addr_param; + struct net *net = asoc->base.net; struct sctp_transport *t; enum sctp_scope scope; union sctp_addr addr; @@ -2597,15 +2599,20 @@ do_addr_param: break; case SCTP_PARAM_ECN_CAPABLE: - asoc->peer.ecn_capable = 1; - break; + if (asoc->ep->ecn_enable) { + asoc->peer.ecn_capable = 1; + break; + } + /* Fall Through */ + goto fall_through; + case SCTP_PARAM_ADAPTATION_LAYER_IND: asoc->peer.adaptation_ind = ntohl(param.aind->adaptation_ind); break; case SCTP_PARAM_SET_PRIMARY: - if (!net->sctp.addip_enable) + if (!ep->asconf_enable) goto fall_through; addr_param = param.v + sizeof(struct sctp_addip_param); diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 1cf5bb5b73c4..2bc29463e1dc 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -516,8 +516,6 @@ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands, struct sctp_transport *transport, int is_hb) { - struct net *net = sock_net(asoc->base.sk); - /* The check for association's overall error counter exceeding the * threshold is done in the state function. */ @@ -544,10 +542,10 @@ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands, * is SCTP_ACTIVE, then mark this transport as Partially Failed, * see SCTP Quick Failover Draft, section 5.1 */ - if (net->sctp.pf_enable && - (transport->state == SCTP_ACTIVE) && - (transport->error_count < transport->pathmaxrxt) && - (transport->error_count > asoc->pf_retrans)) { + if (asoc->base.net->sctp.pf_enable && + transport->state == SCTP_ACTIVE && + transport->error_count < transport->pathmaxrxt && + transport->error_count > transport->pf_retrans) { sctp_assoc_control_transport(asoc, transport, SCTP_TRANSPORT_PF, @@ -567,6 +565,11 @@ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands, SCTP_FAILED_THRESHOLD); } + if (transport->error_count > transport->ps_retrans && + asoc->peer.primary_path == transport && + asoc->peer.active_path != transport) + sctp_assoc_set_primary(asoc, asoc->peer.active_path); + /* E2) For the destination address for which the timer * expires, set RTO <- RTO * 2 ("back off the timer"). The * maximum value discussed in rule C7 above (RTO.max) may be @@ -793,10 +796,8 @@ static int sctp_cmd_process_sack(struct sctp_cmd_seq *cmds, int err = 0; if (sctp_outq_sack(&asoc->outqueue, chunk)) { - struct net *net = sock_net(asoc->base.sk); - /* There are no more TSNs awaiting SACK. */ - err = sctp_do_sm(net, SCTP_EVENT_T_OTHER, + err = sctp_do_sm(asoc->base.net, SCTP_EVENT_T_OTHER, SCTP_ST_OTHER(SCTP_EVENT_NO_PENDING_TSN), asoc->state, asoc->ep, asoc, NULL, GFP_ATOMIC); @@ -829,7 +830,7 @@ static void sctp_cmd_assoc_update(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_association *new) { - struct net *net = sock_net(asoc->base.sk); + struct net *net = asoc->base.net; struct sctp_chunk *abort; if (!sctp_assoc_update(asoc, new)) @@ -1358,8 +1359,10 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type, /* Generate an INIT ACK chunk. */ new_obj = sctp_make_init_ack(asoc, chunk, GFP_ATOMIC, 0); - if (!new_obj) - goto nomem; + if (!new_obj) { + error = -ENOMEM; + break; + } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); @@ -1381,7 +1384,8 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type, if (!new_obj) { if (cmd->obj.chunk) sctp_chunk_free(cmd->obj.chunk); - goto nomem; + error = -ENOMEM; + break; } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); @@ -1428,8 +1432,10 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type, /* Generate a SHUTDOWN chunk. */ new_obj = sctp_make_shutdown(asoc, chunk); - if (!new_obj) - goto nomem; + if (!new_obj) { + error = -ENOMEM; + break; + } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); break; @@ -1765,11 +1771,17 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type, break; } - if (error) + if (error) { + cmd = sctp_next_cmd(commands); + while (cmd) { + if (cmd->verb == SCTP_CMD_REPLY) + sctp_chunk_free(cmd->obj.chunk); + cmd = sctp_next_cmd(commands); + } break; + } } -out: /* If this is in response to a received chunk, wait until * we are done with the packet to open the queue so that we don't * send multiple packets in response to a single request. @@ -1784,7 +1796,4 @@ out: sp->data_ready_signalled = 0; return error; -nomem: - error = -ENOMEM; - goto out; } diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 2c244b29a199..748e3b19ec1d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1320,7 +1320,7 @@ static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc, struct sctp_chunk *init, struct sctp_cmd_seq *commands) { - struct net *net = sock_net(new_asoc->base.sk); + struct net *net = new_asoc->base.net; struct sctp_transport *new_addr; int ret = 1; @@ -2160,8 +2160,10 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook( /* Update socket peer label if first association. */ if (security_sctp_assoc_request((struct sctp_endpoint *)ep, - chunk->skb)) + chunk->skb)) { + sctp_association_free(new_asoc); return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } /* Set temp so that it won't be added into hashtable */ new_asoc->temp = 1; @@ -3279,8 +3281,6 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net, struct sctp_sackhdr *sackh; __u32 ctsn; - trace_sctp_probe(ep, asoc, chunk); - if (!sctp_vtag_verify(chunk, asoc)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -3297,6 +3297,15 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net, chunk->subh.sack_hdr = sackh; ctsn = ntohl(sackh->cum_tsn_ack); + /* If Cumulative TSN Ack beyond the max tsn currently + * send, terminating the association and respond to the + * sender with an ABORT. + */ + if (TSN_lte(asoc->next_tsn, ctsn)) + return sctp_sf_violation_ctsn(net, ep, asoc, type, arg, commands); + + trace_sctp_probe(ep, asoc, chunk); + /* i) If Cumulative TSN Ack is less than the Cumulative TSN * Ack Point, then drop the SACK. Since Cumulative TSN * Ack is monotonically increasing, a SACK whose @@ -3310,13 +3319,6 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net, return SCTP_DISPOSITION_DISCARD; } - /* If Cumulative TSN Ack beyond the max tsn currently - * send, terminating the association and respond to the - * sender with an ABORT. - */ - if (!TSN_lt(ctsn, asoc->next_tsn)) - return sctp_sf_violation_ctsn(net, ep, asoc, type, arg, commands); - /* Return this SACK for further processing. */ sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, SCTP_CHUNK(chunk)); @@ -3721,7 +3723,8 @@ enum sctp_disposition sctp_sf_do_asconf(struct net *net, * is received unauthenticated it MUST be silently discarded as * described in [I-D.ietf-tsvwg-sctp-auth]. */ - if (!net->sctp.addip_noauth && !chunk->auth) + if (!asoc->peer.asconf_capable || + (!net->sctp.addip_noauth && !chunk->auth)) return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); @@ -3863,7 +3866,8 @@ enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net, * is received unauthenticated it MUST be silently discarded as * described in [I-D.ietf-tsvwg-sctp-auth]. */ - if (!net->sctp.addip_noauth && !asconf_ack->auth) + if (!asoc->peer.asconf_capable || + (!net->sctp.addip_noauth && !asconf_ack->auth)) return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 61ed9c6e3be3..88ea87f4f0e7 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -976,26 +976,22 @@ static const struct sctp_sm_table_entry *sctp_chunk_event_lookup( if (cid <= SCTP_CID_BASE_MAX) return &chunk_event_table[cid][state]; - if (net->sctp.prsctp_enable) { - if (cid == SCTP_CID_FWD_TSN || cid == SCTP_CID_I_FWD_TSN) - return &prsctp_chunk_event_table[0][state]; - } + switch ((u16)cid) { + case SCTP_CID_FWD_TSN: + case SCTP_CID_I_FWD_TSN: + return &prsctp_chunk_event_table[0][state]; - if (net->sctp.addip_enable) { - if (cid == SCTP_CID_ASCONF) - return &addip_chunk_event_table[0][state]; + case SCTP_CID_ASCONF: + return &addip_chunk_event_table[0][state]; - if (cid == SCTP_CID_ASCONF_ACK) - return &addip_chunk_event_table[1][state]; - } + case SCTP_CID_ASCONF_ACK: + return &addip_chunk_event_table[1][state]; - if (net->sctp.reconf_enable) - if (cid == SCTP_CID_RECONF) - return &reconf_chunk_event_table[0][state]; + case SCTP_CID_RECONF: + return &reconf_chunk_event_table[0][state]; - if (net->sctp.auth_enable) { - if (cid == SCTP_CID_AUTH) - return &auth_chunk_event_table[0][state]; + case SCTP_CID_AUTH: + return &auth_chunk_event_table[0][state]; } return &chunk_event_table_unknown[state]; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9d1f83b10c0a..1b56fc440606 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -309,7 +309,7 @@ static int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len) return retval; } -static long sctp_get_port_local(struct sock *, union sctp_addr *); +static int sctp_get_port_local(struct sock *, union sctp_addr *); /* Verify this is a valid sockaddr. */ static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt, @@ -384,7 +384,7 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) } } - if (snum && snum < inet_prot_sock(net) && + if (snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; @@ -399,9 +399,8 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) * detection. */ addr->v4.sin_port = htons(snum); - if ((ret = sctp_get_port_local(sk, addr))) { + if (sctp_get_port_local(sk, addr)) return -EADDRINUSE; - } /* Refresh ephemeral port. */ if (!bp->port) @@ -413,11 +412,13 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) ret = sctp_add_bind_addr(bp, addr, af->sockaddr_len, SCTP_ADDR_SRC, GFP_ATOMIC); - /* Copy back into socket for getsockname() use. */ - if (!ret) { - inet_sk(sk)->inet_sport = htons(inet_sk(sk)->inet_num); - sp->pf->to_sk_saddr(addr, sk); + if (ret) { + sctp_put_port(sk); + return ret; } + /* Copy back into socket for getsockname() use. */ + inet_sk(sk)->inet_sport = htons(inet_sk(sk)->inet_num); + sp->pf->to_sk_saddr(addr, sk); return ret; } @@ -435,8 +436,7 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) static int sctp_send_asconf(struct sctp_association *asoc, struct sctp_chunk *chunk) { - struct net *net = sock_net(asoc->base.sk); - int retval = 0; + int retval = 0; /* If there is an outstanding ASCONF chunk, queue it for later * transmission. @@ -448,7 +448,7 @@ static int sctp_send_asconf(struct sctp_association *asoc, /* Hold the chunk until an ASCONF_ACK is received. */ sctp_chunk_hold(chunk); - retval = sctp_primitive_ASCONF(net, asoc, chunk); + retval = sctp_primitive_ASCONF(asoc->base.net, asoc, chunk); if (retval) sctp_chunk_free(chunk); else @@ -524,7 +524,6 @@ static int sctp_send_asconf_add_ip(struct sock *sk, struct sockaddr *addrs, int addrcnt) { - struct net *net = sock_net(sk); struct sctp_sock *sp; struct sctp_endpoint *ep; struct sctp_association *asoc; @@ -539,12 +538,12 @@ static int sctp_send_asconf_add_ip(struct sock *sk, int i; int retval = 0; - if (!net->sctp.addip_enable) - return retval; - sp = sctp_sk(sk); ep = sp->ep; + if (!ep->asconf_enable) + return retval; + pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n", __func__, sk, addrs, addrcnt); @@ -727,7 +726,6 @@ static int sctp_send_asconf_del_ip(struct sock *sk, struct sockaddr *addrs, int addrcnt) { - struct net *net = sock_net(sk); struct sctp_sock *sp; struct sctp_endpoint *ep; struct sctp_association *asoc; @@ -743,12 +741,12 @@ static int sctp_send_asconf_del_ip(struct sock *sk, int stored = 0; chunk = NULL; - if (!net->sctp.addip_enable) - return retval; - sp = sctp_sk(sk); ep = sp->ep; + if (!ep->asconf_enable) + return retval; + pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n", __func__, sk, addrs, addrcnt); @@ -1044,158 +1042,161 @@ out: return err; } -/* __sctp_connect(struct sock* sk, struct sockaddr *kaddrs, int addrs_size) - * - * Common routine for handling connect() and sctp_connectx(). - * Connect will come in with just a single address. - */ -static int __sctp_connect(struct sock *sk, - struct sockaddr *kaddrs, - int addrs_size, int flags, - sctp_assoc_t *assoc_id) +static int sctp_connect_new_asoc(struct sctp_endpoint *ep, + const union sctp_addr *daddr, + const struct sctp_initmsg *init, + struct sctp_transport **tp) { + struct sctp_association *asoc; + struct sock *sk = ep->base.sk; struct net *net = sock_net(sk); - struct sctp_sock *sp; - struct sctp_endpoint *ep; - struct sctp_association *asoc = NULL; - struct sctp_association *asoc2; - struct sctp_transport *transport; - union sctp_addr to; enum sctp_scope scope; - long timeo; - int err = 0; - int addrcnt = 0; - int walk_size = 0; - union sctp_addr *sa_addr = NULL; - void *addr_buf; - unsigned short port; + int err; - sp = sctp_sk(sk); - ep = sp->ep; + if (sctp_endpoint_is_peeled_off(ep, daddr)) + return -EADDRNOTAVAIL; - /* connect() cannot be done on a socket that is already in ESTABLISHED - * state - UDP-style peeled off socket or a TCP-style socket that - * is already connected. - * It cannot be done even on a TCP-style listening socket. - */ - if (sctp_sstate(sk, ESTABLISHED) || sctp_sstate(sk, CLOSING) || - (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) { - err = -EISCONN; - goto out_free; + if (!ep->base.bind_addr.port) { + if (sctp_autobind(sk)) + return -EAGAIN; + } else { + if (inet_port_requires_bind_service(net, ep->base.bind_addr.port) && + !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) + return -EACCES; } - /* Walk through the addrs buffer and count the number of addresses. */ - addr_buf = kaddrs; - while (walk_size < addrs_size) { - struct sctp_af *af; - - if (walk_size + sizeof(sa_family_t) > addrs_size) { - err = -EINVAL; - goto out_free; - } + scope = sctp_scope(daddr); + asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL); + if (!asoc) + return -ENOMEM; - sa_addr = addr_buf; - af = sctp_get_af_specific(sa_addr->sa.sa_family); + err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL); + if (err < 0) + goto free; - /* If the address family is not supported or if this address - * causes the address buffer to overflow return EINVAL. - */ - if (!af || (walk_size + af->sockaddr_len) > addrs_size) { - err = -EINVAL; - goto out_free; - } + *tp = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN); + if (!*tp) { + err = -ENOMEM; + goto free; + } - port = ntohs(sa_addr->v4.sin_port); + if (!init) + return 0; - /* Save current address so we can work with it */ - memcpy(&to, sa_addr, af->sockaddr_len); + if (init->sinit_num_ostreams) { + __u16 outcnt = init->sinit_num_ostreams; - err = sctp_verify_addr(sk, &to, af->sockaddr_len); + asoc->c.sinit_num_ostreams = outcnt; + /* outcnt has been changed, need to re-init stream */ + err = sctp_stream_init(&asoc->stream, outcnt, 0, GFP_KERNEL); if (err) - goto out_free; + goto free; + } - /* Make sure the destination port is correctly set - * in all addresses. - */ - if (asoc && asoc->peer.port && asoc->peer.port != port) { - err = -EINVAL; - goto out_free; - } + if (init->sinit_max_instreams) + asoc->c.sinit_max_instreams = init->sinit_max_instreams; - /* Check if there already is a matching association on the - * endpoint (other than the one created here). - */ - asoc2 = sctp_endpoint_lookup_assoc(ep, &to, &transport); - if (asoc2 && asoc2 != asoc) { - if (asoc2->state >= SCTP_STATE_ESTABLISHED) - err = -EISCONN; - else - err = -EALREADY; - goto out_free; - } + if (init->sinit_max_attempts) + asoc->max_init_attempts = init->sinit_max_attempts; - /* If we could not find a matching association on the endpoint, - * make sure that there is no peeled-off association matching - * the peer address even on another socket. - */ - if (sctp_endpoint_is_peeled_off(ep, &to)) { - err = -EADDRNOTAVAIL; - goto out_free; - } + if (init->sinit_max_init_timeo) + asoc->max_init_timeo = + msecs_to_jiffies(init->sinit_max_init_timeo); - if (!asoc) { - /* If a bind() or sctp_bindx() is not called prior to - * an sctp_connectx() call, the system picks an - * ephemeral port and will choose an address set - * equivalent to binding with a wildcard address. - */ - if (!ep->base.bind_addr.port) { - if (sctp_autobind(sk)) { - err = -EAGAIN; - goto out_free; - } - } else { - /* - * If an unprivileged user inherits a 1-many - * style socket with open associations on a - * privileged port, it MAY be permitted to - * accept new associations, but it SHOULD NOT - * be permitted to open new associations. - */ - if (ep->base.bind_addr.port < - inet_prot_sock(net) && - !ns_capable(net->user_ns, - CAP_NET_BIND_SERVICE)) { - err = -EACCES; - goto out_free; - } - } + return 0; +free: + sctp_association_free(asoc); + return err; +} - scope = sctp_scope(&to); - asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL); - if (!asoc) { - err = -ENOMEM; - goto out_free; - } +static int sctp_connect_add_peer(struct sctp_association *asoc, + union sctp_addr *daddr, int addr_len) +{ + struct sctp_endpoint *ep = asoc->ep; + struct sctp_association *old; + struct sctp_transport *t; + int err; - err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, - GFP_KERNEL); - if (err < 0) { - goto out_free; - } + err = sctp_verify_addr(ep->base.sk, daddr, addr_len); + if (err) + return err; - } + old = sctp_endpoint_lookup_assoc(ep, daddr, &t); + if (old && old != asoc) + return old->state >= SCTP_STATE_ESTABLISHED ? -EISCONN + : -EALREADY; + + if (sctp_endpoint_is_peeled_off(ep, daddr)) + return -EADDRNOTAVAIL; + + t = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN); + if (!t) + return -ENOMEM; + + return 0; +} + +/* __sctp_connect(struct sock* sk, struct sockaddr *kaddrs, int addrs_size) + * + * Common routine for handling connect() and sctp_connectx(). + * Connect will come in with just a single address. + */ +static int __sctp_connect(struct sock *sk, struct sockaddr *kaddrs, + int addrs_size, int flags, sctp_assoc_t *assoc_id) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct sctp_endpoint *ep = sp->ep; + struct sctp_transport *transport; + struct sctp_association *asoc; + void *addr_buf = kaddrs; + union sctp_addr *daddr; + struct sctp_af *af; + int walk_size, err; + long timeo; + + if (sctp_sstate(sk, ESTABLISHED) || sctp_sstate(sk, CLOSING) || + (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) + return -EISCONN; - /* Prime the peer's transport structures. */ - transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL, - SCTP_UNKNOWN); - if (!transport) { - err = -ENOMEM; + daddr = addr_buf; + af = sctp_get_af_specific(daddr->sa.sa_family); + if (!af || af->sockaddr_len > addrs_size) + return -EINVAL; + + err = sctp_verify_addr(sk, daddr, af->sockaddr_len); + if (err) + return err; + + asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport); + if (asoc) + return asoc->state >= SCTP_STATE_ESTABLISHED ? -EISCONN + : -EALREADY; + + err = sctp_connect_new_asoc(ep, daddr, NULL, &transport); + if (err) + return err; + asoc = transport->asoc; + + addr_buf += af->sockaddr_len; + walk_size = af->sockaddr_len; + while (walk_size < addrs_size) { + err = -EINVAL; + if (walk_size + sizeof(sa_family_t) > addrs_size) goto out_free; - } - addrcnt++; - addr_buf += af->sockaddr_len; + daddr = addr_buf; + af = sctp_get_af_specific(daddr->sa.sa_family); + if (!af || af->sockaddr_len + walk_size > addrs_size) + goto out_free; + + if (asoc->peer.port != ntohs(daddr->v4.sin_port)) + goto out_free; + + err = sctp_connect_add_peer(asoc, daddr, af->sockaddr_len); + if (err) + goto out_free; + + addr_buf += af->sockaddr_len; walk_size += af->sockaddr_len; } @@ -1208,40 +1209,25 @@ static int __sctp_connect(struct sock *sk, goto out_free; } - err = sctp_primitive_ASSOCIATE(net, asoc, NULL); - if (err < 0) { + err = sctp_primitive_ASSOCIATE(sock_net(sk), asoc, NULL); + if (err < 0) goto out_free; - } /* Initialize sk's dport and daddr for getpeername() */ inet_sk(sk)->inet_dport = htons(asoc->peer.port); - sp->pf->to_sk_daddr(sa_addr, sk); + sp->pf->to_sk_daddr(daddr, sk); sk->sk_err = 0; - timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); - if (assoc_id) *assoc_id = asoc->assoc_id; - err = sctp_wait_for_connect(asoc, &timeo); - /* Note: the asoc may be freed after the return of - * sctp_wait_for_connect. - */ - - /* Don't free association on exit. */ - asoc = NULL; + timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); + return sctp_wait_for_connect(asoc, &timeo); out_free: pr_debug("%s: took out_free path with asoc:%p kaddrs:%p err:%d\n", __func__, asoc, kaddrs, err); - - if (asoc) { - /* sctp_primitive_ASSOCIATE may have added this association - * To the hash table, try to unhash it, just in case, its a noop - * if it wasn't hashed so we're safe - */ - sctp_association_free(asoc); - } + sctp_association_free(asoc); return err; } @@ -1311,7 +1297,8 @@ static int __sctp_setsockopt_connectx(struct sock *sk, pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n", __func__, sk, addrs, addrs_size); - if (unlikely(addrs_size <= 0)) + /* make sure the 1st addr's sa_family is accessible later */ + if (unlikely(addrs_size < sizeof(sa_family_t))) return -EINVAL; kaddrs = memdup_user(addrs, addrs_size); @@ -1659,9 +1646,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, struct sctp_transport **tp) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; - struct net *net = sock_net(sk); struct sctp_association *asoc; - enum sctp_scope scope; struct cmsghdr *cmsg; __be32 flowinfo = 0; struct sctp_af *af; @@ -1676,20 +1661,6 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, sctp_sstate(sk, CLOSING))) return -EADDRNOTAVAIL; - if (sctp_endpoint_is_peeled_off(ep, daddr)) - return -EADDRNOTAVAIL; - - if (!ep->base.bind_addr.port) { - if (sctp_autobind(sk)) - return -EAGAIN; - } else { - if (ep->base.bind_addr.port < inet_prot_sock(net) && - !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) - return -EACCES; - } - - scope = sctp_scope(daddr); - /* Label connection socket for first association 1-to-many * style for client sequence socket()->sendmsg(). This * needs to be done before sctp_assoc_add_peer() as that will @@ -1705,45 +1676,10 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, if (err < 0) return err; - asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL); - if (!asoc) - return -ENOMEM; - - if (sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL) < 0) { - err = -ENOMEM; - goto free; - } - - if (cmsgs->init) { - struct sctp_initmsg *init = cmsgs->init; - - if (init->sinit_num_ostreams) { - __u16 outcnt = init->sinit_num_ostreams; - - asoc->c.sinit_num_ostreams = outcnt; - /* outcnt has been changed, need to re-init stream */ - err = sctp_stream_init(&asoc->stream, outcnt, 0, - GFP_KERNEL); - if (err) - goto free; - } - - if (init->sinit_max_instreams) - asoc->c.sinit_max_instreams = init->sinit_max_instreams; - - if (init->sinit_max_attempts) - asoc->max_init_attempts = init->sinit_max_attempts; - - if (init->sinit_max_init_timeo) - asoc->max_init_timeo = - msecs_to_jiffies(init->sinit_max_init_timeo); - } - - *tp = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN); - if (!*tp) { - err = -ENOMEM; - goto free; - } + err = sctp_connect_new_asoc(ep, daddr, cmsgs->init, tp); + if (err) + return err; + asoc = (*tp)->asoc; if (!cmsgs->addrs_msg) return 0; @@ -1753,8 +1689,6 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, /* sendv addr list parse */ for_each_cmsghdr(cmsg, cmsgs->addrs_msg) { - struct sctp_transport *transport; - struct sctp_association *old; union sctp_addr _daddr; int dlen; @@ -1788,30 +1722,10 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, daddr->v6.sin6_port = htons(asoc->peer.port); memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen); } - err = sctp_verify_addr(sk, daddr, sizeof(*daddr)); - if (err) - goto free; - - old = sctp_endpoint_lookup_assoc(ep, daddr, &transport); - if (old && old != asoc) { - if (old->state >= SCTP_STATE_ESTABLISHED) - err = -EISCONN; - else - err = -EALREADY; - goto free; - } - - if (sctp_endpoint_is_peeled_off(ep, daddr)) { - err = -EADDRNOTAVAIL; - goto free; - } - transport = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, - SCTP_UNKNOWN); - if (!transport) { - err = -ENOMEM; + err = sctp_connect_add_peer(asoc, daddr, sizeof(*daddr)); + if (err) goto free; - } } return 0; @@ -2513,9 +2427,8 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, int error; if (params->spp_flags & SPP_HB_DEMAND && trans) { - struct net *net = sock_net(trans->asoc->base.sk); - - error = sctp_primitive_REQUESTHEARTBEAT(net, trans->asoc, trans); + error = sctp_primitive_REQUESTHEARTBEAT(trans->asoc->base.net, + trans->asoc, trans); if (error) return error; } @@ -3414,7 +3327,6 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optval, unsigned int optlen) { - struct net *net = sock_net(sk); struct sctp_sock *sp; struct sctp_association *asoc = NULL; struct sctp_setpeerprim prim; @@ -3424,7 +3336,7 @@ static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optva sp = sctp_sk(sk); - if (!net->sctp.addip_enable) + if (!sp->ep->asconf_enable) return -EPERM; if (optlen != sizeof(struct sctp_setpeerprim)) @@ -3774,9 +3686,6 @@ static int sctp_setsockopt_auth_key(struct sock *sk, struct sctp_association *asoc; int ret = -EINVAL; - if (!ep->auth_enable) - return -EACCES; - if (optlen <= sizeof(struct sctp_authkey)) return -EINVAL; /* authkey->sca_keylength is u16, so optlen can't be bigger than @@ -3843,9 +3752,6 @@ static int sctp_setsockopt_active_key(struct sock *sk, struct sctp_authkeyid val; int ret = 0; - if (!ep->auth_enable) - return -EACCES; - if (optlen != sizeof(struct sctp_authkeyid)) return -EINVAL; if (copy_from_user(&val, optval, optlen)) @@ -3897,9 +3803,6 @@ static int sctp_setsockopt_del_key(struct sock *sk, struct sctp_authkeyid val; int ret = 0; - if (!ep->auth_enable) - return -EACCES; - if (optlen != sizeof(struct sctp_authkeyid)) return -EINVAL; if (copy_from_user(&val, optval, optlen)) @@ -3950,9 +3853,6 @@ static int sctp_setsockopt_deactivate_key(struct sock *sk, char __user *optval, struct sctp_authkeyid val; int ret = 0; - if (!ep->auth_enable) - return -EACCES; - if (optlen != sizeof(struct sctp_authkeyid)) return -EINVAL; if (copy_from_user(&val, optval, optlen)) @@ -4041,18 +3941,22 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval, */ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, char __user *optval, - unsigned int optlen) + unsigned int optlen, bool v2) { - struct sctp_paddrthlds val; + struct sctp_paddrthlds_v2 val; struct sctp_transport *trans; struct sctp_association *asoc; + int len; - if (optlen < sizeof(struct sctp_paddrthlds)) + len = v2 ? sizeof(val) : sizeof(struct sctp_paddrthlds); + if (optlen < len) return -EINVAL; - if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, - sizeof(struct sctp_paddrthlds))) + if (copy_from_user(&val, optval, len)) return -EFAULT; + if (v2 && val.spt_pathpfthld > val.spt_pathcpthld) + return -EINVAL; + if (!sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) { trans = sctp_addr_id2transport(sk, &val.spt_address, val.spt_assoc_id); @@ -4061,6 +3965,8 @@ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, if (val.spt_pathmaxrxt) trans->pathmaxrxt = val.spt_pathmaxrxt; + if (v2) + trans->ps_retrans = val.spt_pathcpthld; trans->pf_retrans = val.spt_pathpfthld; return 0; @@ -4076,17 +3982,23 @@ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, transports) { if (val.spt_pathmaxrxt) trans->pathmaxrxt = val.spt_pathmaxrxt; + if (v2) + trans->ps_retrans = val.spt_pathcpthld; trans->pf_retrans = val.spt_pathpfthld; } if (val.spt_pathmaxrxt) asoc->pathmaxrxt = val.spt_pathmaxrxt; + if (v2) + asoc->ps_retrans = val.spt_pathcpthld; asoc->pf_retrans = val.spt_pathpfthld; } else { struct sctp_sock *sp = sctp_sk(sk); if (val.spt_pathmaxrxt) sp->pathmaxrxt = val.spt_pathmaxrxt; + if (v2) + sp->ps_retrans = val.spt_pathcpthld; sp->pf_retrans = val.spt_pathpfthld; } @@ -4583,6 +4495,144 @@ static int sctp_setsockopt_event(struct sock *sk, char __user *optval, return retval; } +static int sctp_setsockopt_asconf_supported(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + struct sctp_endpoint *ep; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) + goto out; + + ep = sctp_sk(sk)->ep; + ep->asconf_enable = !!params.assoc_value; + + if (ep->asconf_enable && ep->auth_enable) { + sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF); + sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF_ACK); + } + + retval = 0; + +out: + return retval; +} + +static int sctp_setsockopt_auth_supported(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + struct sctp_endpoint *ep; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) + goto out; + + ep = sctp_sk(sk)->ep; + if (params.assoc_value) { + retval = sctp_auth_init(ep, GFP_KERNEL); + if (retval) + goto out; + if (ep->asconf_enable) { + sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF); + sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF_ACK); + } + } + + ep->auth_enable = !!params.assoc_value; + retval = 0; + +out: + return retval; +} + +static int sctp_setsockopt_ecn_supported(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) + goto out; + + sctp_sk(sk)->ep->ecn_enable = !!params.assoc_value; + retval = 0; + +out: + return retval; +} + +static int sctp_setsockopt_pf_expose(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (params.assoc_value > SCTP_PF_EXPOSE_MAX) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) + goto out; + + if (asoc) + asoc->pf_expose = params.assoc_value; + else + sctp_sk(sk)->pf_expose = params.assoc_value; + retval = 0; + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4738,7 +4788,12 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, retval = sctp_setsockopt_auto_asconf(sk, optval, optlen); break; case SCTP_PEER_ADDR_THLDS: - retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen); + retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen, + false); + break; + case SCTP_PEER_ADDR_THLDS_V2: + retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen, + true); break; case SCTP_RECVRCVINFO: retval = sctp_setsockopt_recvrcvinfo(sk, optval, optlen); @@ -4783,6 +4838,18 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_EVENT: retval = sctp_setsockopt_event(sk, optval, optlen); break; + case SCTP_ASCONF_SUPPORTED: + retval = sctp_setsockopt_asconf_supported(sk, optval, optlen); + break; + case SCTP_AUTH_SUPPORTED: + retval = sctp_setsockopt_auth_supported(sk, optval, optlen); + break; + case SCTP_ECN_SUPPORTED: + retval = sctp_setsockopt_ecn_supported(sk, optval, optlen); + break; + case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: + retval = sctp_setsockopt_pf_expose(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -5026,6 +5093,8 @@ static int sctp_init_sock(struct sock *sk) sp->hbinterval = net->sctp.hb_interval; sp->pathmaxrxt = net->sctp.max_retrans_path; sp->pf_retrans = net->sctp.pf_retrans; + sp->ps_retrans = net->sctp.ps_retrans; + sp->pf_expose = net->sctp.pf_expose; sp->pathmtu = 0; /* allow default discovery */ sp->sackdelay = net->sctp.sack_timeout; sp->sackfreq = 2; @@ -5293,7 +5362,7 @@ struct sctp_transport *sctp_transport_get_next(struct net *net, if (!sctp_transport_hold(t)) continue; - if (net_eq(sock_net(t->asoc->base.sk), net) && + if (net_eq(t->asoc->base.net, net) && t->asoc->peer.primary_path == t) break; @@ -5506,8 +5575,16 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len, transport = sctp_addr_id2transport(sk, &pinfo.spinfo_address, pinfo.spinfo_assoc_id); - if (!transport) - return -EINVAL; + if (!transport) { + retval = -EINVAL; + goto out; + } + + if (transport->state == SCTP_PF && + transport->asoc->pf_expose == SCTP_PF_EXPOSE_DISABLE) { + retval = -EACCES; + goto out; + } pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc); pinfo.spinfo_state = transport->state; @@ -6920,9 +6997,6 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len, struct sctp_authkeyid val; struct sctp_association *asoc; - if (!ep->auth_enable) - return -EACCES; - if (len < sizeof(struct sctp_authkeyid)) return -EINVAL; @@ -6934,10 +7008,15 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len, if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP)) return -EINVAL; - if (asoc) + if (asoc) { + if (!asoc->peer.auth_capable) + return -EACCES; val.scact_keynumber = asoc->active_key_id; - else + } else { + if (!ep->auth_enable) + return -EACCES; val.scact_keynumber = ep->active_key_id; + } if (put_user(len, optlen)) return -EFAULT; @@ -6950,7 +7029,6 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len, static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len, char __user *optval, int __user *optlen) { - struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authchunks __user *p = (void __user *)optval; struct sctp_authchunks val; struct sctp_association *asoc; @@ -6958,9 +7036,6 @@ static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len, u32 num_chunks = 0; char __user *to; - if (!ep->auth_enable) - return -EACCES; - if (len < sizeof(struct sctp_authchunks)) return -EINVAL; @@ -6972,6 +7047,9 @@ static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len, if (!asoc) return -EINVAL; + if (!asoc->peer.auth_capable) + return -EACCES; + ch = asoc->peer.peer_chunks; if (!ch) goto num; @@ -7003,9 +7081,6 @@ static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len, u32 num_chunks = 0; char __user *to; - if (!ep->auth_enable) - return -EACCES; - if (len < sizeof(struct sctp_authchunks)) return -EINVAL; @@ -7018,8 +7093,15 @@ static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len, sctp_style(sk, UDP)) return -EINVAL; - ch = asoc ? (struct sctp_chunks_param *)asoc->c.auth_chunks - : ep->auth_chunk_list; + if (asoc) { + if (!asoc->peer.auth_capable) + return -EACCES; + ch = (struct sctp_chunks_param *)asoc->c.auth_chunks; + } else { + if (!ep->auth_enable) + return -EACCES; + ch = ep->auth_chunk_list; + } if (!ch) goto num; @@ -7150,18 +7232,19 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len, * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt */ static int sctp_getsockopt_paddr_thresholds(struct sock *sk, - char __user *optval, - int len, - int __user *optlen) + char __user *optval, int len, + int __user *optlen, bool v2) { - struct sctp_paddrthlds val; + struct sctp_paddrthlds_v2 val; struct sctp_transport *trans; struct sctp_association *asoc; + int min; - if (len < sizeof(struct sctp_paddrthlds)) + min = v2 ? sizeof(val) : sizeof(struct sctp_paddrthlds); + if (len < min) return -EINVAL; - len = sizeof(struct sctp_paddrthlds); - if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, len)) + len = min; + if (copy_from_user(&val, optval, len)) return -EFAULT; if (!sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) { @@ -7172,8 +7255,9 @@ static int sctp_getsockopt_paddr_thresholds(struct sock *sk, val.spt_pathmaxrxt = trans->pathmaxrxt; val.spt_pathpfthld = trans->pf_retrans; + val.spt_pathcpthld = trans->ps_retrans; - return 0; + goto out; } asoc = sctp_id2assoc(sk, val.spt_assoc_id); @@ -7184,13 +7268,16 @@ static int sctp_getsockopt_paddr_thresholds(struct sock *sk, if (asoc) { val.spt_pathpfthld = asoc->pf_retrans; val.spt_pathmaxrxt = asoc->pathmaxrxt; + val.spt_pathcpthld = asoc->ps_retrans; } else { struct sctp_sock *sp = sctp_sk(sk); val.spt_pathpfthld = sp->pf_retrans; val.spt_pathmaxrxt = sp->pathmaxrxt; + val.spt_pathcpthld = sp->ps_retrans; } +out: if (put_user(len, optlen) || copy_to_user(optval, &val, len)) return -EFAULT; @@ -7762,6 +7849,162 @@ static int sctp_getsockopt_event(struct sock *sk, int len, char __user *optval, return 0; } +static int sctp_getsockopt_asconf_supported(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) { + retval = -EINVAL; + goto out; + } + + params.assoc_value = asoc ? asoc->peer.asconf_capable + : sctp_sk(sk)->ep->asconf_enable; + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + +static int sctp_getsockopt_auth_supported(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) { + retval = -EINVAL; + goto out; + } + + params.assoc_value = asoc ? asoc->peer.auth_capable + : sctp_sk(sk)->ep->auth_enable; + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + +static int sctp_getsockopt_ecn_supported(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) { + retval = -EINVAL; + goto out; + } + + params.assoc_value = asoc ? asoc->peer.ecn_capable + : sctp_sk(sk)->ep->ecn_enable; + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + +static int sctp_getsockopt_pf_expose(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) { + retval = -EINVAL; + goto out; + } + + params.assoc_value = asoc ? asoc->pf_expose + : sctp_sk(sk)->pf_expose; + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -7911,7 +8154,12 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen); break; case SCTP_PEER_ADDR_THLDS: - retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, optlen); + retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, + optlen, false); + break; + case SCTP_PEER_ADDR_THLDS_V2: + retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, + optlen, true); break; case SCTP_GET_ASSOC_STATS: retval = sctp_getsockopt_assoc_stats(sk, len, optval, optlen); @@ -7963,6 +8211,20 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_EVENT: retval = sctp_getsockopt_event(sk, len, optval, optlen); break; + case SCTP_ASCONF_SUPPORTED: + retval = sctp_getsockopt_asconf_supported(sk, len, optval, + optlen); + break; + case SCTP_AUTH_SUPPORTED: + retval = sctp_getsockopt_auth_supported(sk, len, optval, + optlen); + break; + case SCTP_ECN_SUPPORTED: + retval = sctp_getsockopt_ecn_supported(sk, len, optval, optlen); + break; + case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: + retval = sctp_getsockopt_pf_expose(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -7998,11 +8260,12 @@ static void sctp_unhash(struct sock *sk) static struct sctp_bind_bucket *sctp_bucket_create( struct sctp_bind_hashbucket *head, struct net *, unsigned short snum); -static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) +static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr) { struct sctp_sock *sp = sctp_sk(sk); bool reuse = (sk->sk_reuse || sp->reuse); struct sctp_bind_hashbucket *head; /* hash list */ + struct net *net = sock_net(sk); kuid_t uid = sock_i_uid(sk); struct sctp_bind_bucket *pp; unsigned short snum; @@ -8018,7 +8281,6 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) /* Search for an available port. */ int low, high, remaining, index; unsigned int rover; - struct net *net = sock_net(sk); inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; @@ -8030,12 +8292,12 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) rover = low; if (inet_is_local_reserved_port(net, rover)) continue; - index = sctp_phashfn(sock_net(sk), rover); + index = sctp_phashfn(net, rover); head = &sctp_port_hashtable[index]; spin_lock(&head->lock); sctp_for_each_hentry(pp, &head->chain) if ((pp->port == rover) && - net_eq(sock_net(sk), pp->net)) + net_eq(net, pp->net)) goto next; break; next: @@ -8059,10 +8321,10 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) * to the port number (snum) - we detect that with the * port iterator, pp being NULL. */ - head = &sctp_port_hashtable[sctp_phashfn(sock_net(sk), snum)]; + head = &sctp_port_hashtable[sctp_phashfn(net, snum)]; spin_lock(&head->lock); sctp_for_each_hentry(pp, &head->chain) { - if ((pp->port == snum) && net_eq(pp->net, sock_net(sk))) + if ((pp->port == snum) && net_eq(pp->net, net)) goto pp_found; } } @@ -8108,7 +8370,7 @@ pp_found: if (sctp_bind_addr_conflict(&ep2->base.bind_addr, addr, sp2, sp)) { - ret = (long)sk2; + ret = 1; goto fail_unlock; } } @@ -8118,7 +8380,7 @@ pp_found: pp_not_found: /* If there was a hash table miss, create a new port. */ ret = 1; - if (!pp && !(pp = sctp_bucket_create(head, sock_net(sk), snum))) + if (!pp && !(pp = sctp_bucket_create(head, net, snum))) goto fail_unlock; /* In either case (hit or miss), make sure fastreuse is 1 only @@ -8180,7 +8442,7 @@ static int sctp_get_port(struct sock *sk, unsigned short snum) addr.v4.sin_port = htons(snum); /* Note: sk->sk_num gets filled in if ephemeral port request. */ - return !!sctp_get_port_local(sk, &addr); + return sctp_get_port_local(sk, &addr); } /* @@ -8227,7 +8489,7 @@ static int sctp_listen_start(struct sock *sk, int backlog) } } - sk->sk_max_ack_backlog = backlog; + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); return sctp_hash_endpoint(ep); } @@ -8281,7 +8543,7 @@ int sctp_inet_listen(struct socket *sock, int backlog) /* If we are already listening, just update the backlog */ if (sctp_sstate(sk, LISTENING)) - sk->sk_max_ack_backlog = backlog; + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); else { err = sctp_listen_start(sk, backlog); if (err) @@ -8327,7 +8589,7 @@ __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait) mask = 0; /* Is there any exceptional events? */ - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); if (sk->sk_shutdown & RCV_SHUTDOWN) @@ -8336,7 +8598,7 @@ __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait) mask |= EPOLLHUP; /* Is it readable? Reconsider this code with TCP-style support. */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* The association is either gone or not ready. */ @@ -8722,7 +8984,7 @@ struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, if (sk_can_busy_loop(sk)) { sk_busy_loop(sk, noblock); - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) continue; } @@ -9157,7 +9419,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newinet->inet_rcv_saddr = inet->inet_rcv_saddr; newinet->inet_dport = htons(asoc->peer.port); newinet->pmtudisc = inet->pmtudisc; - newinet->inet_id = asoc->next_tsn ^ jiffies; + newinet->inet_id = prandom_u32(); newinet->uc_ttl = inet->uc_ttl; newinet->mc_loop = 1; @@ -9351,7 +9613,7 @@ struct proto sctp_prot = { .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, - .get_port = sctp_get_port, + .no_autobind = true, .obj_size = sizeof(struct sctp_sock), .useroffset = offsetof(struct sctp_sock, subscribe), .usersize = offsetof(struct sctp_sock, initmsg) - @@ -9393,7 +9655,7 @@ struct proto sctpv6_prot = { .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, - .get_port = sctp_get_port, + .no_autobind = true, .obj_size = sizeof(struct sctp6_sock), .useroffset = offsetof(struct sctp6_sock, sctp.subscribe), .usersize = offsetof(struct sctp6_sock, sctp.initmsg) - diff --git a/net/sctp/stream.c b/net/sctp/stream.c index e83cdaa2ab76..67f7e71f9129 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -119,7 +119,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, * a new one with new outcnt to save memory if needed. */ if (outcnt == stream->outcnt) - goto in; + goto handle_in; /* Filter out chunks queued on streams that won't exist anymore */ sched->unsched_all(stream); @@ -128,24 +128,28 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, ret = sctp_stream_alloc_out(stream, outcnt, gfp); if (ret) - goto out; + goto out_err; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; -in: +handle_in: sctp_stream_interleave_init(stream); if (!incnt) goto out; ret = sctp_stream_alloc_in(stream, incnt, gfp); - if (ret) { - sched->free(stream); - genradix_free(&stream->out); - stream->outcnt = 0; - goto out; - } + if (ret) + goto in_err; + goto out; + +in_err: + sched->free(stream); + genradix_free(&stream->in); +out_err: + genradix_free(&stream->out); + stream->outcnt = 0; out: return ret; } @@ -218,10 +222,9 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) static int sctp_send_reconf(struct sctp_association *asoc, struct sctp_chunk *chunk) { - struct net *net = sock_net(asoc->base.sk); int retval = 0; - retval = sctp_primitive_RECONF(net, asoc, chunk); + retval = sctp_primitive_RECONF(asoc->base.net, asoc, chunk); if (retval) sctp_chunk_free(chunk); diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index 40c40be23fcb..6b13f737ebf2 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -241,9 +241,8 @@ out: if (!first_frag) return NULL; - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), - &ulpq->reasm, first_frag, - last_frag); + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, + first_frag, last_frag); if (retval) { sin->fsn = next_fsn; if (is_last) { @@ -326,7 +325,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled( pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { - retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), + retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, pd_first, pd_last); if (retval) { @@ -337,8 +336,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled( goto out; found: - retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), - &ulpq->reasm, + retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; @@ -630,7 +628,7 @@ out: if (!first_frag) return NULL; - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { @@ -716,7 +714,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo( pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { - retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), + retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, pd_first, pd_last); if (retval) { @@ -727,8 +725,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo( goto out; found: - retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), - &ulpq->reasm_uo, + retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; @@ -814,7 +811,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq) return NULL; out: - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { @@ -921,7 +918,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq) return NULL; out: - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval) { @@ -1159,7 +1156,7 @@ static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn) if (ftsn_chunk) { list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); - SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_OUTCTRLCHUNKS); + SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); } } diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 1250751bca1b..4740aa70e652 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -34,6 +34,8 @@ static int rto_alpha_min = 0; static int rto_beta_min = 0; static int rto_alpha_max = 1000; static int rto_beta_max = 1000; +static int pf_expose_max = SCTP_PF_EXPOSE_MAX; +static int ps_retrans_max = SCTP_PS_RETRANS_MAX; static unsigned long max_autoclose_min = 0; static unsigned long max_autoclose_max = @@ -212,7 +214,16 @@ static struct ctl_table sctp_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_INT_MAX, + .extra2 = &init_net.sctp.ps_retrans, + }, + { + .procname = "ps_retrans", + .data = &init_net.sctp.ps_retrans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.sctp.pf_retrans, + .extra2 = &ps_retrans_max, }, { .procname = "sndbuf_policy", @@ -278,6 +289,13 @@ static struct ctl_table sctp_net_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "ecn_enable", + .data = &init_net.sctp.ecn_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "addr_scope_policy", .data = &init_net.sctp.scope_policy, .maxlen = sizeof(int), @@ -311,6 +329,15 @@ static struct ctl_table sctp_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "pf_expose", + .data = &init_net.sctp.pf_expose, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &pf_expose_max, + }, { /* sentinel */ } }; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index e2f8e369cd08..806af58f4375 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -43,8 +43,8 @@ static struct sctp_transport *sctp_transport_init(struct net *net, gfp_t gfp) { /* Copy in the address. */ - peer->ipaddr = *addr; peer->af_specific = sctp_get_af_specific(addr->sa.sa_family); + memcpy(&peer->ipaddr, addr, peer->af_specific->sockaddr_len); memset(&peer->saddr, 0, sizeof(union sctp_addr)); peer->sack_generation = 0; @@ -263,7 +263,7 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) pf->af->from_sk(&addr, sk); pf->to_sk_daddr(&t->ipaddr, sk); - dst->ops->update_pmtu(dst, sk, NULL, pmtu); + dst->ops->update_pmtu(dst, sk, NULL, pmtu, true); pf->to_sk_daddr(&addr, sk); dst = sctp_transport_dst_check(t); @@ -334,7 +334,7 @@ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt) pr_debug("%s: rto_pending not set on transport %p!\n", __func__, tp); if (tp->rttvar || tp->srtt) { - struct net *net = sock_net(tp->asoc->base.sk); + struct net *net = tp->asoc->base.net; /* 6.3.1 C3) When a new RTT measurement R' is made, set * RTTVAR <- (1 - RTO.Beta) * RTTVAR + RTO.Beta * |SRTT - R'| * SRTT <- (1 - RTO.Alpha) * SRTT + RTO.Alpha * R' diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index e0cc1edf49a0..c82dbdcf13f2 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -238,7 +238,7 @@ fail: * When a destination address on a multi-homed peer encounters a change * an interface details event is sent. */ -struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( +static struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( const struct sctp_association *asoc, const struct sockaddr_storage *aaddr, int flags, int state, int error, gfp_t gfp) @@ -336,6 +336,22 @@ fail: return NULL; } +void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport, + int state, int error) +{ + struct sctp_association *asoc = transport->asoc; + struct sockaddr_storage addr; + struct sctp_ulpevent *event; + + memset(&addr, 0, sizeof(struct sockaddr_storage)); + memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len); + + event = sctp_ulpevent_make_peer_addr_change(asoc, &addr, 0, state, + error, GFP_ATOMIC); + if (event) + asoc->stream.si->enqueue_event(&asoc->ulpq, event); +} + /* Create and initialize an SCTP_REMOTE_ERROR notification. * * Note: This assumes that the chunk->skb->data already points to the @@ -511,6 +527,45 @@ fail: return NULL; } +struct sctp_ulpevent *sctp_ulpevent_make_send_failed_event( + const struct sctp_association *asoc, struct sctp_chunk *chunk, + __u16 flags, __u32 error, gfp_t gfp) +{ + struct sctp_send_failed_event *ssf; + struct sctp_ulpevent *event; + struct sk_buff *skb; + int len; + + skb = skb_copy_expand(chunk->skb, sizeof(*ssf), 0, gfp); + if (!skb) + return NULL; + + len = ntohs(chunk->chunk_hdr->length); + len -= sctp_datachk_len(&asoc->stream); + + skb_pull(skb, sctp_datachk_len(&asoc->stream)); + event = sctp_skb2event(skb); + sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); + + ssf = skb_push(skb, sizeof(*ssf)); + ssf->ssf_type = SCTP_SEND_FAILED_EVENT; + ssf->ssf_flags = flags; + ssf->ssf_length = sizeof(*ssf) + len; + skb_trim(skb, ssf->ssf_length); + ssf->ssf_error = error; + + ssf->ssfe_info.snd_sid = chunk->sinfo.sinfo_stream; + ssf->ssfe_info.snd_ppid = chunk->sinfo.sinfo_ppid; + ssf->ssfe_info.snd_context = chunk->sinfo.sinfo_context; + ssf->ssfe_info.snd_assoc_id = chunk->sinfo.sinfo_assoc_id; + ssf->ssfe_info.snd_flags = chunk->chunk_hdr->flags; + + sctp_ulpevent_set_owner(event, asoc); + ssf->ssf_assoc_id = sctp_assoc2id(asoc); + + return event; +} + /* Create and initialize a SCTP_SHUTDOWN_EVENT notification. * * Socket Extensions for SCTP - draft-01 diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index b6536b7f14c0..1c6c640607c5 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -486,10 +486,9 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ul cevent = sctp_skb2event(pd_first); pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { - retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), + retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, - pd_first, - pd_last); + pd_first, pd_last); if (retval) sctp_ulpq_set_pd(ulpq); } @@ -497,7 +496,7 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ul done: return retval; found: - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; @@ -563,8 +562,8 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq *ulpq) * further. */ done: - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), - &ulpq->reasm, first_frag, last_frag); + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, + first_frag, last_frag); if (retval && is_last) retval->msg_flags |= MSG_EOR; @@ -664,8 +663,8 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *ulpq) * further. */ done: - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), - &ulpq->reasm, first_frag, last_frag); + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, + first_frag, last_frag); return retval; } diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5b932583e407..90988a511cd5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -25,6 +25,7 @@ #include <linux/in.h> #include <linux/sched/signal.h> #include <linux/if_vlan.h> +#include <linux/rcupdate_wait.h> #include <net/sock.h> #include <net/tcp.h> @@ -123,6 +124,12 @@ struct proto smc_proto6 = { }; EXPORT_SYMBOL_GPL(smc_proto6); +static void smc_restore_fallback_changes(struct smc_sock *smc) +{ + smc->clcsock->file->private_data = smc->sk.sk_socket; + smc->clcsock->file = NULL; +} + static int __smc_release(struct smc_sock *smc) { struct sock *sk = &smc->sk; @@ -141,6 +148,7 @@ static int __smc_release(struct smc_sock *smc) } sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); + smc_restore_fallback_changes(smc); } sk->sk_prot->unhash(sk); @@ -167,6 +175,7 @@ static int smc_release(struct socket *sock) if (!sk) goto out; + sock_hold(sk); /* sock_put below */ smc = smc_sk(sk); /* cleanup for a dangling non-blocking connect */ @@ -189,6 +198,7 @@ static int smc_release(struct socket *sock) sock->sk = NULL; release_sock(sk); + sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ out: return rc; @@ -460,6 +470,8 @@ static void smc_switch_to_fallback(struct smc_sock *smc) if (smc->sk.sk_socket && smc->sk.sk_socket->file) { smc->clcsock->file = smc->sk.sk_socket->file; smc->clcsock->file->private_data = smc->clcsock; + smc->clcsock->wq.fasync_list = + smc->sk.sk_socket->wq.fasync_list; } } @@ -700,8 +712,6 @@ static int __smc_connect(struct smc_sock *smc) int smc_type; int rc = 0; - sock_hold(&smc->sk); /* sock put in passive closing */ - if (smc->use_fallback) return smc_connect_fallback(smc, smc->fallback_rsn); @@ -791,6 +801,7 @@ static void smc_connect_work(struct work_struct *work) smc->sk.sk_err = EPIPE; else if (signal_pending(current)) smc->sk.sk_err = -sock_intr_errno(timeo); + sock_put(&smc->sk); /* passive closing */ goto out; } @@ -846,6 +857,10 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, rc = kernel_connect(smc->clcsock, addr, alen, flags); if (rc && rc != -EINPROGRESS) goto out; + + sock_hold(&smc->sk); /* sock put in passive closing */ + if (smc->use_fallback) + goto out; if (flags & O_NONBLOCK) { if (schedule_work(&smc->connect_work)) smc->connect_nonblock = 1; @@ -970,12 +985,14 @@ void smc_close_non_accepted(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); + sock_hold(sk); /* sock_put below */ lock_sock(sk); if (!sk->sk_lingertime) /* wait for peer closing */ sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; __smc_release(smc); release_sock(sk); + sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ } @@ -1291,8 +1308,8 @@ static void smc_listen_work(struct work_struct *work) /* check if RDMA is available */ if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */ /* prepare RDMA check */ - memset(&ini, 0, sizeof(ini)); ini.is_smcd = false; + ini.ism_dev = NULL; ini.ib_lcl = &pclc->lcl; rc = smc_find_rdma_device(new_smc, &ini); if (rc) { @@ -1708,8 +1725,6 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_err = smc->clcsock->sk->sk_err; sk->sk_error_report(sk); } - if (rc) - return rc; if (optlen < sizeof(int)) return -EINVAL; @@ -1717,6 +1732,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; lock_sock(sk); + if (rc || smc->use_fallback) + goto out; switch (optname) { case TCP_ULP: case TCP_FASTOPEN: @@ -1724,19 +1741,18 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ - if (sk->sk_state == SMC_INIT) { + if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { smc_switch_to_fallback(smc); smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { - if (!smc->use_fallback) - rc = -EINVAL; + rc = -EINVAL; } break; case TCP_NODELAY: if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { - if (val && !smc->use_fallback) + if (val) mod_delayed_work(system_wq, &smc->conn.tx_work, 0); } @@ -1745,7 +1761,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { - if (!val && !smc->use_fallback) + if (!val) mod_delayed_work(system_wq, &smc->conn.tx_work, 0); } @@ -1756,6 +1772,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, default: break; } +out: release_sock(sk); return rc; @@ -2027,22 +2044,28 @@ static int __init smc_init(void) if (rc) goto out_pernet_subsys; + rc = smc_core_init(); + if (rc) { + pr_err("%s: smc_core_init fails with %d\n", __func__, rc); + goto out_pnet; + } + rc = smc_llc_init(); if (rc) { pr_err("%s: smc_llc_init fails with %d\n", __func__, rc); - goto out_pnet; + goto out_core; } rc = smc_cdc_init(); if (rc) { pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc); - goto out_pnet; + goto out_core; } rc = proto_register(&smc_proto, 1); if (rc) { pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc); - goto out_pnet; + goto out_core; } rc = proto_register(&smc_proto6, 1); @@ -2074,6 +2097,8 @@ out_proto6: proto_unregister(&smc_proto6); out_proto: proto_unregister(&smc_proto); +out_core: + smc_core_exit(); out_pnet: smc_pnet_exit(); out_pernet_subsys: @@ -2084,14 +2109,15 @@ out_pernet_subsys: static void __exit smc_exit(void) { - smc_core_exit(); static_branch_disable(&tcp_have_smc); - smc_ib_unregister_client(); sock_unregister(PF_SMC); + smc_core_exit(); + smc_ib_unregister_client(); proto_unregister(&smc_proto6); proto_unregister(&smc_proto); smc_pnet_exit(); unregister_pernet_subsys(&smc_net_ops); + rcu_barrier(); } module_init(smc_init); diff --git a/net/smc/smc.h b/net/smc/smc.h index 878313f8d6c1..be11ba41190f 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -188,6 +188,7 @@ struct smc_connection { * 0 for SMC-R, 32 for SMC-D */ u64 peer_token; /* SMC-D token of peer */ + u8 killed : 1; /* abnormal termination */ }; struct smc_sock { /* smc sock container */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index d0b0f4c865b4..164f1584861b 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -63,7 +63,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn, rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, wr_rdma_buf, (struct smc_wr_tx_pend_priv **)pend); - if (!conn->alert_token_local) + if (conn->killed) /* abnormal termination */ rc = -EPIPE; return rc; @@ -131,6 +131,9 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) { int rc; + if (!conn->lgr || (conn->lgr->is_smcd && conn->lgr->peer_shutdown)) + return -EPIPE; + if (conn->lgr->is_smcd) { spin_lock_bh(&conn->send_lock); rc = smcd_cdc_msg_send(conn); @@ -328,7 +331,7 @@ static void smcd_cdc_rx_tsklet(unsigned long data) struct smcd_cdc_msg cdc; struct smc_sock *smc; - if (!conn) + if (!conn || conn->killed) return; data_cdc = (struct smcd_cdc_msg *)conn->rmb_desc->cpu_addr; diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 49bcebff6378..86cccc24e52e 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -349,7 +349,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, smc->peer_diagnosis = ntohl(dclc->peer_diagnosis); if (((struct smc_clc_msg_decline *)buf)->hdr.flag) { smc->conn.lgr->sync_err = 1; - smc_lgr_terminate(smc->conn.lgr); + smc_lgr_terminate(smc->conn.lgr, true); } } @@ -372,7 +372,9 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); dclc.hdr.version = SMC_CLC_V1; dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0; - memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); + if (smc->conn.lgr && !smc->conn.lgr->is_smcd) + memcpy(dclc.id_for_peer, local_systemid, + sizeof(local_systemid)); dclc.peer_diagnosis = htonl(peer_diag_info); memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index fc06720b53c1..290270c821ca 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -13,14 +13,13 @@ #include <linux/sched/signal.h> #include <net/sock.h> +#include <net/tcp.h> #include "smc.h" #include "smc_tx.h" #include "smc_cdc.h" #include "smc_close.h" -#define SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME (5 * HZ) - /* release the clcsock that is assigned to the smc_sock */ void smc_clcsock_release(struct smc_sock *smc) { @@ -65,8 +64,9 @@ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) rc = sk_wait_event(sk, &timeout, !smc_tx_prepared_sends(&smc->conn) || - (sk->sk_err == ECONNABORTED) || - (sk->sk_err == ECONNRESET), + sk->sk_err == ECONNABORTED || + sk->sk_err == ECONNRESET || + smc->conn.killed, &wait); if (rc) break; @@ -95,68 +95,73 @@ static int smc_close_final(struct smc_connection *conn) conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; else conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1; + if (conn->killed) + return -EPIPE; return smc_cdc_get_slot_and_msg_send(conn); } -static int smc_close_abort(struct smc_connection *conn) +int smc_close_abort(struct smc_connection *conn) { conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; return smc_cdc_get_slot_and_msg_send(conn); } +static void smc_close_cancel_work(struct smc_sock *smc) +{ + struct sock *sk = &smc->sk; + + release_sock(sk); + cancel_work_sync(&smc->conn.close_work); + cancel_delayed_work_sync(&smc->conn.tx_work); + lock_sock(sk); + sk->sk_state = SMC_CLOSED; +} + /* terminate smc socket abnormally - active abort * link group is terminated, i.e. RDMA communication no longer possible */ -static void smc_close_active_abort(struct smc_sock *smc) +void smc_close_active_abort(struct smc_sock *smc) { struct sock *sk = &smc->sk; - - struct smc_cdc_conn_state_flags *txflags = - &smc->conn.local_tx_ctrl.conn_state_flags; + bool release_clcsock = false; if (sk->sk_state != SMC_INIT && smc->clcsock && smc->clcsock->sk) { sk->sk_err = ECONNABORTED; - if (smc->clcsock && smc->clcsock->sk) { - smc->clcsock->sk->sk_err = ECONNABORTED; - smc->clcsock->sk->sk_state_change(smc->clcsock->sk); - } + if (smc->clcsock && smc->clcsock->sk) + tcp_abort(smc->clcsock->sk, ECONNABORTED); } switch (sk->sk_state) { case SMC_ACTIVE: sk->sk_state = SMC_PEERABORTWAIT; - release_sock(sk); - cancel_delayed_work_sync(&smc->conn.tx_work); - lock_sock(sk); + smc_close_cancel_work(smc); + sk->sk_state = SMC_CLOSED; sock_put(sk); /* passive closing */ break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: - if (!smc_cdc_rxed_any_close(&smc->conn)) - sk->sk_state = SMC_PEERABORTWAIT; - else - sk->sk_state = SMC_CLOSED; - release_sock(sk); - cancel_delayed_work_sync(&smc->conn.tx_work); - lock_sock(sk); + smc_close_cancel_work(smc); + sk->sk_state = SMC_CLOSED; + sock_put(sk); /* postponed passive closing */ break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: - if (!txflags->peer_conn_closed) { - /* just SHUTDOWN_SEND done */ - sk->sk_state = SMC_PEERABORTWAIT; - } else { - sk->sk_state = SMC_CLOSED; - } + case SMC_PEERFINCLOSEWAIT: + sk->sk_state = SMC_PEERABORTWAIT; + smc_close_cancel_work(smc); + sk->sk_state = SMC_CLOSED; + smc_conn_free(&smc->conn); + release_clcsock = true; sock_put(sk); /* passive closing */ break; case SMC_PROCESSABORT: case SMC_APPFINCLOSEWAIT: + sk->sk_state = SMC_PEERABORTWAIT; + smc_close_cancel_work(smc); sk->sk_state = SMC_CLOSED; - break; - case SMC_PEERFINCLOSEWAIT: - sock_put(sk); /* passive closing */ + smc_conn_free(&smc->conn); + release_clcsock = true; break; case SMC_INIT: case SMC_PEERABORTWAIT: @@ -166,6 +171,12 @@ static void smc_close_active_abort(struct smc_sock *smc) sock_set_flag(sk, SOCK_DEAD); sk->sk_state_change(sk); + + if (release_clcsock) { + release_sock(sk); + smc_clcsock_release(smc); + lock_sock(sk); + } } static inline bool smc_close_sent_any_close(struct smc_connection *conn) @@ -215,8 +226,6 @@ again: if (sk->sk_state == SMC_ACTIVE) { /* send close request */ rc = smc_close_final(conn); - if (rc) - break; sk->sk_state = SMC_PEERCLOSEWAIT1; } else { /* peer event has changed the state */ @@ -229,8 +238,6 @@ again: !smc_close_sent_any_close(conn)) { /* just shutdown wr done, send close request */ rc = smc_close_final(conn); - if (rc) - break; } sk->sk_state = SMC_CLOSED; break; @@ -246,8 +253,6 @@ again: goto again; /* confirm close from peer */ rc = smc_close_final(conn); - if (rc) - break; if (smc_cdc_rxed_any_close(conn)) { /* peer has closed the socket already */ sk->sk_state = SMC_CLOSED; @@ -263,8 +268,6 @@ again: !smc_close_sent_any_close(conn)) { /* just shutdown wr done, send close request */ rc = smc_close_final(conn); - if (rc) - break; } /* peer sending PeerConnectionClosed will cause transition */ break; @@ -272,10 +275,12 @@ again: /* peer sending PeerConnectionClosed will cause transition */ break; case SMC_PROCESSABORT: - smc_close_abort(conn); + rc = smc_close_abort(conn); sk->sk_state = SMC_CLOSED; break; case SMC_PEERABORTWAIT: + sk->sk_state = SMC_CLOSED; + break; case SMC_CLOSED: /* nothing to do, add tracing in future patch */ break; @@ -344,12 +349,6 @@ static void smc_close_passive_work(struct work_struct *work) lock_sock(sk); old_state = sk->sk_state; - if (!conn->alert_token_local) { - /* abnormal termination */ - smc_close_active_abort(smc); - goto wakeup; - } - rxflags = &conn->local_rx_ctrl.conn_state_flags; if (rxflags->peer_conn_abort) { /* peer has not received all data */ @@ -451,8 +450,6 @@ again: goto again; /* send close wr request */ rc = smc_close_wr(conn); - if (rc) - break; sk->sk_state = SMC_PEERCLOSEWAIT1; break; case SMC_APPCLOSEWAIT1: @@ -466,8 +463,6 @@ again: goto again; /* confirm close from peer */ rc = smc_close_wr(conn); - if (rc) - break; sk->sk_state = SMC_APPCLOSEWAIT2; break; case SMC_APPCLOSEWAIT2: diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h index e0e3b5df25d2..634fea2b7c95 100644 --- a/net/smc/smc_close.h +++ b/net/smc/smc_close.h @@ -24,5 +24,7 @@ int smc_close_active(struct smc_sock *smc); int smc_close_shutdown_write(struct smc_sock *smc); void smc_close_init(struct smc_sock *smc); void smc_clcsock_release(struct smc_sock *smc); +int smc_close_abort(struct smc_connection *conn); +void smc_close_active_abort(struct smc_sock *smc); #endif /* SMC_CLOSE_H */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 4ca50ddf8d16..2249de5379ee 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -13,6 +13,8 @@ #include <linux/if_vlan.h> #include <linux/random.h> #include <linux/workqueue.h> +#include <linux/wait.h> +#include <linux/reboot.h> #include <net/tcp.h> #include <net/sock.h> #include <rdma/ib_verbs.h> @@ -39,23 +41,46 @@ static struct smc_lgr_list smc_lgr_list = { /* established link groups */ .num = 0, }; +static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */ +static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); + static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc); +/* return head of link group list and its lock for a given link group */ +static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, + spinlock_t **lgr_lock) +{ + if (lgr->is_smcd) { + *lgr_lock = &lgr->smcd->lgr_lock; + return &lgr->smcd->lgr_list; + } + + *lgr_lock = &smc_lgr_list.lock; + return &smc_lgr_list.list; +} + static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) { /* client link group creation always follows the server link group * creation. For client use a somewhat higher removal delay time, * otherwise there is a risk of out-of-sync link groups. */ - mod_delayed_work(system_wq, &lgr->free_work, - (!lgr->is_smcd && lgr->role == SMC_CLNT) ? - SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); + if (!lgr->freeing && !lgr->freefast) { + mod_delayed_work(system_wq, &lgr->free_work, + (!lgr->is_smcd && lgr->role == SMC_CLNT) ? + SMC_LGR_FREE_DELAY_CLNT : + SMC_LGR_FREE_DELAY_SERV); + } } void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr) { - mod_delayed_work(system_wq, &lgr->free_work, SMC_LGR_FREE_DELAY_FAST); + if (!lgr->freeing && !lgr->freefast) { + lgr->freefast = 1; + mod_delayed_work(system_wq, &lgr->free_work, + SMC_LGR_FREE_DELAY_FAST); + } } /* Register connection's alert token in our lookup structure. @@ -134,16 +159,17 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); + conn->lgr = NULL; } /* Send delete link, either as client to request the initiation * of the DELETE LINK sequence from server; or as server to * initiate the delete processing. See smc_llc_rx_delete_link(). */ -static int smc_link_send_delete(struct smc_link *lnk) +static int smc_link_send_delete(struct smc_link *lnk, bool orderly) { if (lnk->state == SMC_LNK_ACTIVE && - !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) { + !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) { smc_llc_link_deleting(lnk); return 0; } @@ -157,48 +183,62 @@ static void smc_lgr_free_work(struct work_struct *work) struct smc_link_group *lgr = container_of(to_delayed_work(work), struct smc_link_group, free_work); + spinlock_t *lgr_lock; + struct smc_link *lnk; bool conns; - spin_lock_bh(&smc_lgr_list.lock); + smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + if (lgr->freeing) { + spin_unlock_bh(lgr_lock); + return; + } read_lock_bh(&lgr->conns_lock); conns = RB_EMPTY_ROOT(&lgr->conns_all); read_unlock_bh(&lgr->conns_lock); if (!conns) { /* number of lgr connections is no longer zero */ - spin_unlock_bh(&smc_lgr_list.lock); + spin_unlock_bh(lgr_lock); return; } - if (!list_empty(&lgr->list)) - list_del_init(&lgr->list); /* remove from smc_lgr_list */ - spin_unlock_bh(&smc_lgr_list.lock); + list_del_init(&lgr->list); /* remove from smc_lgr_list */ + lnk = &lgr->lnk[SMC_SINGLE_LINK]; if (!lgr->is_smcd && !lgr->terminating) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; - /* try to send del link msg, on error free lgr immediately */ if (lnk->state == SMC_LNK_ACTIVE && - !smc_link_send_delete(lnk)) { + !smc_link_send_delete(lnk, true)) { /* reschedule in case we never receive a response */ smc_lgr_schedule_free_work(lgr); + spin_unlock_bh(lgr_lock); return; } } + lgr->freeing = 1; /* this instance does the freeing, no new schedule */ + spin_unlock_bh(lgr_lock); + cancel_delayed_work(&lgr->free_work); + + if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) + smc_llc_link_inactive(lnk); + if (lgr->is_smcd && !lgr->terminating) + smc_ism_signal_shutdown(lgr); + smc_lgr_free(lgr); +} - if (!delayed_work_pending(&lgr->free_work)) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; +static void smc_lgr_terminate_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + terminate_work); - if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); - if (lgr->is_smcd) - smc_ism_signal_shutdown(lgr); - smc_lgr_free(lgr); - } + smc_lgr_terminate(lgr, true); } /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_link_group *lgr; + struct list_head *lgr_list; struct smc_link *lnk; + spinlock_t *lgr_lock; u8 rndvec[3]; int rc = 0; int i; @@ -213,10 +253,13 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { rc = SMC_CLC_DECL_MEM; - goto out; + goto ism_put_vlan; } lgr->is_smcd = ini->is_smcd; lgr->sync_err = 0; + lgr->terminating = 0; + lgr->freefast = 0; + lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; rwlock_init(&lgr->sndbufs_lock); rwlock_init(&lgr->rmbs_lock); @@ -228,13 +271,20 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) smc_lgr_list.num += SMC_LGR_NUM_INCR; memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); + INIT_WORK(&lgr->terminate_work, smc_lgr_terminate_work); lgr->conns_all = RB_ROOT; if (ini->is_smcd) { /* SMC-D specific settings */ + get_device(&ini->ism_dev->dev); lgr->peer_gid = ini->ism_gid; lgr->smcd = ini->ism_dev; + lgr_list = &ini->ism_dev->lgr_list; + lgr_lock = &lgr->smcd->lgr_lock; + lgr->peer_shutdown = 0; + atomic_inc(&ini->ism_dev->lgr_cnt); } else { /* SMC-R specific settings */ + get_device(&ini->ib_dev->ibdev->dev); lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); @@ -245,6 +295,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lnk->link_id = SMC_SINGLE_LINK; lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; + lgr_list = &smc_lgr_list.list; + lgr_lock = &smc_lgr_list.lock; lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; if (!ini->ib_dev->initialized) @@ -272,11 +324,13 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) rc = smc_wr_create_link(lnk); if (rc) goto destroy_qp; + atomic_inc(&lgr_cnt); + atomic_inc(&ini->ib_dev->lnk_cnt); } smc->conn.lgr = lgr; - spin_lock_bh(&smc_lgr_list.lock); - list_add(&lgr->list, &smc_lgr_list.list); - spin_unlock_bh(&smc_lgr_list.lock); + spin_lock_bh(lgr_lock); + list_add(&lgr->list, lgr_list); + spin_unlock_bh(lgr_lock); return 0; destroy_qp: @@ -289,6 +343,9 @@ clear_llc_lnk: smc_llc_link_clear(lnk); free_lgr: kfree(lgr); +ism_put_vlan: + if (ini->is_smcd && ini->vlan_id) + smc_ism_put_vlan(ini->ism_dev, ini->vlan_id); out: if (rc < 0) { if (rc == -ENOMEM) @@ -306,7 +363,7 @@ static void smc_buf_unuse(struct smc_connection *conn, conn->sndbuf_desc->used = 0; if (conn->rmb_desc) { if (!conn->rmb_desc->regerr) { - if (!lgr->is_smcd) { + if (!lgr->is_smcd && !list_empty(&lgr->list)) { /* unregister rmb with peer */ smc_llc_do_delete_rkey( &lgr->lnk[SMC_SINGLE_LINK], @@ -332,14 +389,16 @@ void smc_conn_free(struct smc_connection *conn) if (!lgr) return; if (lgr->is_smcd) { - smc_ism_unset_conn(conn); + if (!list_empty(&lgr->list)) + smc_ism_unset_conn(conn); tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_tx_dismiss_slots(conn); } - smc_lgr_unregister_conn(conn); - smc_buf_unuse(conn, lgr); /* allow buffer reuse */ - conn->lgr = NULL; + if (!list_empty(&lgr->list)) { + smc_lgr_unregister_conn(conn); + smc_buf_unuse(conn, lgr); /* allow buffer reuse */ + } if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); @@ -354,6 +413,8 @@ static void smc_link_clear(struct smc_link *lnk) smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); + if (!atomic_dec_return(&lnk->smcibdev->lnk_cnt)) + wake_up(&lnk->smcibdev->lnks_deleted); } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, @@ -430,24 +491,101 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) static void smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); - if (lgr->is_smcd) - smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - else + if (lgr->is_smcd) { + if (!lgr->terminating) { + smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); + put_device(&lgr->smcd->dev); + } + if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) + wake_up(&lgr->smcd->lgrs_deleted); + } else { smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); + put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev); + if (!atomic_dec_return(&lgr_cnt)) + wake_up(&lgrs_deleted); + } kfree(lgr); } void smc_lgr_forget(struct smc_link_group *lgr) { - spin_lock_bh(&smc_lgr_list.lock); + struct list_head *lgr_list; + spinlock_t *lgr_lock; + + lgr_list = smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); /* do not use this link group for new connections */ - if (!list_empty(&lgr->list)) - list_del_init(&lgr->list); - spin_unlock_bh(&smc_lgr_list.lock); + if (!list_empty(lgr_list)) + list_del_init(lgr_list); + spin_unlock_bh(lgr_lock); +} + +static void smcd_unregister_all_dmbs(struct smc_link_group *lgr) +{ + int i; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + struct smc_buf_desc *buf_desc; + + list_for_each_entry(buf_desc, &lgr->rmbs[i], list) { + buf_desc->len += sizeof(struct smcd_cdc_msg); + smc_ism_unregister_dmb(lgr->smcd, buf_desc); + } + } +} + +static void smc_sk_wake_ups(struct smc_sock *smc) +{ + smc->sk.sk_write_space(&smc->sk); + smc->sk.sk_data_ready(&smc->sk); + smc->sk.sk_state_change(&smc->sk); } -/* terminate linkgroup abnormally */ -static void __smc_lgr_terminate(struct smc_link_group *lgr) +/* kill a connection */ +static void smc_conn_kill(struct smc_connection *conn, bool soft) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + if (conn->lgr->is_smcd && conn->lgr->peer_shutdown) + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + else + smc_close_abort(conn); + conn->killed = 1; + smc->sk.sk_err = ECONNABORTED; + smc_sk_wake_ups(smc); + if (conn->lgr->is_smcd) { + smc_ism_unset_conn(conn); + if (soft) + tasklet_kill(&conn->rx_tsklet); + else + tasklet_unlock_wait(&conn->rx_tsklet); + } else { + smc_cdc_tx_dismiss_slots(conn); + } + smc_lgr_unregister_conn(conn); + smc_close_active_abort(smc); +} + +static void smc_lgr_cleanup(struct smc_link_group *lgr) +{ + if (lgr->is_smcd) { + smc_ism_signal_shutdown(lgr); + smcd_unregister_all_dmbs(lgr); + smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); + put_device(&lgr->smcd->dev); + } else { + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + + wake_up(&lnk->wr_reg_wait); + if (lnk->state != SMC_LNK_INACTIVE) { + smc_link_send_delete(lnk, false); + smc_llc_link_inactive(lnk); + } + } +} + +/* terminate link group */ +static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) { struct smc_connection *conn; struct smc_sock *smc; @@ -455,80 +593,161 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr) if (lgr->terminating) return; /* lgr already terminating */ + if (!soft) + cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; - if (!list_empty(&lgr->list)) /* forget lgr */ - list_del_init(&lgr->list); if (!lgr->is_smcd) smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); - write_lock_bh(&lgr->conns_lock); + /* kill remaining link group connections */ + read_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); while (node) { + read_unlock_bh(&lgr->conns_lock); conn = rb_entry(node, struct smc_connection, alert_node); smc = container_of(conn, struct smc_sock, conn); - sock_hold(&smc->sk); /* sock_put in close work */ - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; - __smc_lgr_unregister_conn(conn); - conn->lgr = NULL; - write_unlock_bh(&lgr->conns_lock); - if (!schedule_work(&conn->close_work)) - sock_put(&smc->sk); - write_lock_bh(&lgr->conns_lock); + sock_hold(&smc->sk); /* sock_put below */ + lock_sock(&smc->sk); + smc_conn_kill(conn, soft); + release_sock(&smc->sk); + sock_put(&smc->sk); /* sock_hold above */ + read_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); } - write_unlock_bh(&lgr->conns_lock); - if (!lgr->is_smcd) - wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); - smc_lgr_schedule_free_work(lgr); + read_unlock_bh(&lgr->conns_lock); + smc_lgr_cleanup(lgr); + if (soft) + smc_lgr_schedule_free_work_fast(lgr); + else + smc_lgr_free(lgr); } -void smc_lgr_terminate(struct smc_link_group *lgr) +/* unlink and terminate link group + * @soft: true if link group shutdown can take its time + * false if immediate link group shutdown is required + */ +void smc_lgr_terminate(struct smc_link_group *lgr, bool soft) { - spin_lock_bh(&smc_lgr_list.lock); - __smc_lgr_terminate(lgr); - spin_unlock_bh(&smc_lgr_list.lock); + spinlock_t *lgr_lock; + + smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + if (lgr->terminating) { + spin_unlock_bh(lgr_lock); + return; /* lgr already terminating */ + } + if (!soft) + lgr->freeing = 1; + list_del_init(&lgr->list); + spin_unlock_bh(lgr_lock); + __smc_lgr_terminate(lgr, soft); } /* Called when IB port is terminated */ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *l; + LIST_HEAD(lgr_free_list); spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { if (!lgr->is_smcd && lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && - lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) - __smc_lgr_terminate(lgr); + lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) { + list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; + } } spin_unlock_bh(&smc_lgr_list.lock); + + list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { + list_del_init(&lgr->list); + __smc_lgr_terminate(lgr, false); + } } -/* Called when SMC-D device is terminated or peer is lost */ +/* Called when peer lgr shutdown (regularly or abnormally) is received */ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) { struct smc_link_group *lgr, *l; LIST_HEAD(lgr_free_list); /* run common cleanup function and build free list */ - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (lgr->is_smcd && lgr->smcd == dev && - (!peer_gid || lgr->peer_gid == peer_gid) && + spin_lock_bh(&dev->lgr_lock); + list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) { + if ((!peer_gid || lgr->peer_gid == peer_gid) && (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) { - __smc_lgr_terminate(lgr); + if (peer_gid) /* peer triggered termination */ + lgr->peer_shutdown = 1; list_move(&lgr->list, &lgr_free_list); } } - spin_unlock_bh(&smc_lgr_list.lock); + spin_unlock_bh(&dev->lgr_lock); /* cancel the regular free workers and actually free lgrs */ list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { list_del_init(&lgr->list); - cancel_delayed_work_sync(&lgr->free_work); - if (!peer_gid && vlan == VLAN_VID_MASK) /* dev terminated? */ - smc_ism_signal_shutdown(lgr); - smc_lgr_free(lgr); + schedule_work(&lgr->terminate_work); + } +} + +/* Called when an SMCD device is removed or the smc module is unloaded */ +void smc_smcd_terminate_all(struct smcd_dev *smcd) +{ + struct smc_link_group *lgr, *lg; + LIST_HEAD(lgr_free_list); + + spin_lock_bh(&smcd->lgr_lock); + list_splice_init(&smcd->lgr_list, &lgr_free_list); + list_for_each_entry(lgr, &lgr_free_list, list) + lgr->freeing = 1; + spin_unlock_bh(&smcd->lgr_lock); + + list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { + list_del_init(&lgr->list); + __smc_lgr_terminate(lgr, false); + } + + if (atomic_read(&smcd->lgr_cnt)) + wait_event(smcd->lgrs_deleted, !atomic_read(&smcd->lgr_cnt)); +} + +/* Called when an SMCR device is removed or the smc module is unloaded. + * If smcibdev is given, all SMCR link groups using this device are terminated. + * If smcibdev is NULL, all SMCR link groups are terminated. + */ +void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) +{ + struct smc_link_group *lgr, *lg; + LIST_HEAD(lgr_free_list); + + spin_lock_bh(&smc_lgr_list.lock); + if (!smcibdev) { + list_splice_init(&smc_lgr_list.list, &lgr_free_list); + list_for_each_entry(lgr, &lgr_free_list, list) + lgr->freeing = 1; + } else { + list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { + if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev) { + list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; + } + } + } + spin_unlock_bh(&smc_lgr_list.lock); + + list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { + list_del_init(&lgr->list); + __smc_lgr_terminate(lgr, false); + } + + if (smcibdev) { + if (atomic_read(&smcibdev->lnk_cnt)) + wait_event(smcibdev->lnks_deleted, + !atomic_read(&smcibdev->lnk_cnt)); + } else { + if (atomic_read(&lgr_cnt)) + wait_event(lgrs_deleted, !atomic_read(&lgr_cnt)); } } @@ -558,7 +777,7 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini) } rtnl_lock(); - nest_lvl = dev_get_nest_level(ndev); + nest_lvl = ndev->lower_level; for (i = 0; i < nest_lvl; i++) { struct list_head *lower = &ndev->adj_list.lower; @@ -604,10 +823,14 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; + struct list_head *lgr_list; struct smc_link_group *lgr; enum smc_lgr_role role; + spinlock_t *lgr_lock; int rc = 0; + lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list; + lgr_lock = ini->is_smcd ? &ini->ism_dev->lgr_lock : &smc_lgr_list.lock; ini->cln_first_contact = SMC_FIRST_CONTACT; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; if (role == SMC_CLNT && ini->srv_first_contact) @@ -615,8 +838,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) goto create; /* determine if an existing link group can be reused */ - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry(lgr, &smc_lgr_list.list, list) { + spin_lock_bh(lgr_lock); + list_for_each_entry(lgr, lgr_list, list) { write_lock_bh(&lgr->conns_lock); if ((ini->is_smcd ? smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) : @@ -636,7 +859,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) } write_unlock_bh(&lgr->conns_lock); } - spin_unlock_bh(&smc_lgr_list.lock); + spin_unlock_bh(lgr_lock); if (role == SMC_CLNT && !ini->srv_first_contact && ini->cln_first_contact == SMC_FIRST_CONTACT) { @@ -1024,29 +1247,62 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn, return 0; } -/* Called (from smc_exit) when module is removed */ -void smc_core_exit(void) +static void smc_core_going_away(void) { - struct smc_link_group *lgr, *lg; - LIST_HEAD(lgr_freeing_list); + struct smc_ib_device *smcibdev; + struct smcd_dev *smcd; - spin_lock_bh(&smc_lgr_list.lock); - if (!list_empty(&smc_lgr_list.list)) - list_splice_init(&smc_lgr_list.list, &lgr_freeing_list); - spin_unlock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { - list_del_init(&lgr->list); - if (!lgr->is_smcd) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { + int i; - if (lnk->state == SMC_LNK_ACTIVE) - smc_llc_send_delete_link(lnk, SMC_LLC_REQ, - false); - smc_llc_link_inactive(lnk); - } - cancel_delayed_work_sync(&lgr->free_work); - if (lgr->is_smcd) - smc_ism_signal_shutdown(lgr); - smc_lgr_free(lgr); /* free link group */ + for (i = 0; i < SMC_MAX_PORTS; i++) + set_bit(i, smcibdev->ports_going_away); + } + spin_unlock(&smc_ib_devices.lock); + + spin_lock(&smcd_dev_list.lock); + list_for_each_entry(smcd, &smcd_dev_list.list, list) { + smcd->going_away = 1; } + spin_unlock(&smcd_dev_list.lock); +} + +/* Clean up all SMC link groups */ +static void smc_lgrs_shutdown(void) +{ + struct smcd_dev *smcd; + + smc_core_going_away(); + + smc_smcr_terminate_all(NULL); + + spin_lock(&smcd_dev_list.lock); + list_for_each_entry(smcd, &smcd_dev_list.list, list) + smc_smcd_terminate_all(smcd); + spin_unlock(&smcd_dev_list.lock); +} + +static int smc_core_reboot_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + smc_lgrs_shutdown(); + smc_ib_unregister_client(); + return 0; +} + +static struct notifier_block smc_reboot_notifier = { + .notifier_call = smc_core_reboot_event, +}; + +int __init smc_core_init(void) +{ + return register_reboot_notifier(&smc_reboot_notifier); +} + +/* Called (from smc_exit) when module is removed */ +void smc_core_exit(void) +{ + unregister_reboot_notifier(&smc_reboot_notifier); + smc_lgrs_shutdown(); } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index c00ac61dc129..c472e12951d1 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -202,8 +202,11 @@ struct smc_link_group { u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ + struct work_struct terminate_work; /* abnormal lgr termination */ u8 sync_err : 1; /* lgr no longer fits to peer */ u8 terminating : 1;/* lgr is terminating */ + u8 freefast : 1; /* free worker scheduled fast */ + u8 freeing : 1; /* lgr is being freed */ bool is_smcd; /* SMC-R or SMC-D */ union { @@ -225,6 +228,8 @@ struct smc_link_group { /* Peer GID (remote) */ struct smcd_dev *smcd; /* ISM device for VLAN reg. */ + u8 peer_shutdown : 1; + /* peer triggered shutdownn */ }; }; }; @@ -280,15 +285,23 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } +static inline void smc_lgr_terminate_sched(struct smc_link_group *lgr) +{ + if (!lgr->terminating && !lgr->freeing) + schedule_work(&lgr->terminate_work); +} + struct smc_sock; struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; void smc_lgr_forget(struct smc_link_group *lgr); -void smc_lgr_terminate(struct smc_link_group *lgr); +void smc_lgr_terminate(struct smc_link_group *lgr, bool soft); void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan); +void smc_smcd_terminate_all(struct smcd_dev *dev); +void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, @@ -305,6 +318,7 @@ void smc_conn_free(struct smc_connection *conn); int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); void smcd_conn_free(struct smc_connection *conn); void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); +int smc_core_init(void); void smc_core_exit(void); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index f38727ecf8b2..e1f64f4ba236 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -39,16 +39,15 @@ static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) { struct smc_sock *smc = smc_sk(sk); + memset(r, 0, sizeof(*r)); r->diag_family = sk->sk_family; + sock_diag_save_cookie(sk, r->id.idiag_cookie); if (!smc->clcsock) return; r->id.idiag_sport = htons(smc->clcsock->sk->sk_num); r->id.idiag_dport = smc->clcsock->sk->sk_dport; r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if; - sock_diag_save_cookie(sk, r->id.idiag_cookie); if (sk->sk_protocol == SMCPROTO_SMC) { - memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); - memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index d14ca4af6f94..548632621f4b 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -15,6 +15,7 @@ #include <linux/random.h> #include <linux/workqueue.h> #include <linux/scatterlist.h> +#include <linux/wait.h> #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> @@ -242,8 +243,12 @@ static void smc_ib_port_event_work(struct work_struct *work) for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) { smc_ib_remember_port_attr(smcibdev, port_idx + 1); clear_bit(port_idx, &smcibdev->port_event_mask); - if (!smc_ib_port_active(smcibdev, port_idx + 1)) + if (!smc_ib_port_active(smcibdev, port_idx + 1)) { + set_bit(port_idx, smcibdev->ports_going_away); smc_port_terminate(smcibdev, port_idx + 1); + } else { + clear_bit(port_idx, smcibdev->ports_going_away); + } } } @@ -259,8 +264,10 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, switch (ibevent->event) { case IB_EVENT_DEVICE_FATAL: /* terminate all ports on device */ - for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) + for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) { set_bit(port_idx, &smcibdev->port_event_mask); + set_bit(port_idx, smcibdev->ports_going_away); + } schedule_work(&smcibdev->port_event_work); break; case IB_EVENT_PORT_ERR: @@ -269,6 +276,10 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, port_idx = ibevent->element.port_num - 1; if (port_idx < SMC_MAX_PORTS) { set_bit(port_idx, &smcibdev->port_event_mask); + if (ibevent->event == IB_EVENT_PORT_ERR) + set_bit(port_idx, smcibdev->ports_going_away); + else if (ibevent->event == IB_EVENT_PORT_ACTIVE) + clear_bit(port_idx, smcibdev->ports_going_away); schedule_work(&smcibdev->port_event_work); } break; @@ -307,6 +318,7 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) port_idx = ibevent->element.qp->port - 1; if (port_idx < SMC_MAX_PORTS) { set_bit(port_idx, &smcibdev->port_event_mask); + set_bit(port_idx, smcibdev->ports_going_away); schedule_work(&smcibdev->port_event_work); } break; @@ -509,9 +521,9 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) if (!smcibdev->initialized) return; smcibdev->initialized = 0; - smc_wr_remove_dev(smcibdev); ib_destroy_cq(smcibdev->roce_cq_recv); ib_destroy_cq(smcibdev->roce_cq_send); + smc_wr_remove_dev(smcibdev); } static struct ib_client smc_ib_client; @@ -532,7 +544,8 @@ static void smc_ib_add_dev(struct ib_device *ibdev) smcibdev->ibdev = ibdev; INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work); - + atomic_set(&smcibdev->lnk_cnt, 0); + init_waitqueue_head(&smcibdev->lnks_deleted); spin_lock(&smc_ib_devices.lock); list_add_tail(&smcibdev->list, &smc_ib_devices.list); spin_unlock(&smc_ib_devices.lock); @@ -554,7 +567,7 @@ static void smc_ib_add_dev(struct ib_device *ibdev) schedule_work(&smcibdev->port_event_work); } -/* callback function for ib_register_client() */ +/* callback function for ib_unregister_client() */ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) { struct smc_ib_device *smcibdev; @@ -564,6 +577,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) spin_lock(&smc_ib_devices.lock); list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ spin_unlock(&smc_ib_devices.lock); + smc_smcr_terminate_all(smcibdev); smc_ib_cleanup_per_ibdev(smcibdev); ib_unregister_event_handler(&smcibdev->event_handler); kfree(smcibdev); diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index da60ab9e8d70..255db87547d3 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -14,6 +14,7 @@ #include <linux/interrupt.h> #include <linux/if_ether.h> +#include <linux/wait.h> #include <rdma/ib_verbs.h> #include <net/smc.h> @@ -47,6 +48,9 @@ struct smc_ib_device { /* ib-device infos for smc */ u8 initialized : 1; /* ib dev CQ, evthdl done */ struct work_struct port_event_work; unsigned long port_event_mask; + DECLARE_BITMAP(ports_going_away, SMC_MAX_PORTS); + atomic_t lnk_cnt; /* number of links on ibdev */ + wait_queue_head_t lnks_deleted; /* wait 4 removal of all links*/ }; struct smc_buf_desc; diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index e89e918b88e0..5c4727d5066e 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -146,6 +146,10 @@ out: int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) { struct smcd_dmb dmb; + int rc = 0; + + if (!dmb_desc->dma_addr) + return rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; @@ -153,7 +157,13 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; - return smcd->ops->unregister_dmb(smcd, &dmb); + rc = smcd->ops->unregister_dmb(smcd, &dmb); + if (!rc || rc == ISM_ERROR) { + dmb_desc->cpu_addr = NULL; + dmb_desc->dma_addr = 0; + } + + return rc; } int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, @@ -226,6 +236,9 @@ int smc_ism_signal_shutdown(struct smc_link_group *lgr) int rc; union smcd_sw_event_info ev_info; + if (lgr->peer_shutdown) + return 0; + memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; ev_info.code = ISM_EVENT_REQUEST; @@ -286,7 +299,10 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, smc_pnetid_by_dev_port(parent, 0, smcd->pnetid); spin_lock_init(&smcd->lock); + spin_lock_init(&smcd->lgr_lock); INIT_LIST_HEAD(&smcd->vlan); + INIT_LIST_HEAD(&smcd->lgr_list); + init_waitqueue_head(&smcd->lgrs_deleted); smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); if (!smcd->event_wq) { @@ -311,11 +327,12 @@ EXPORT_SYMBOL_GPL(smcd_register_dev); void smcd_unregister_dev(struct smcd_dev *smcd) { spin_lock(&smcd_dev_list.lock); - list_del(&smcd->list); + list_del_init(&smcd->list); spin_unlock(&smcd_dev_list.lock); + smcd->going_away = 1; + smc_smcd_terminate_all(smcd); flush_workqueue(smcd->event_wq); destroy_workqueue(smcd->event_wq); - smc_smcd_terminate(smcd, 0, VLAN_VID_MASK); device_del(&smcd->dev); } @@ -342,6 +359,8 @@ void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) { struct smc_ism_event_work *wrk; + if (smcd->going_away) + return; /* copy event to event work queue, and let it be handled there */ wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); if (!wrk) @@ -367,7 +386,7 @@ void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno) spin_lock_irqsave(&smcd->lock, flags); conn = smcd->conn[dmbno]; - if (conn) + if (conn && !conn->killed) tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 4fd60c522802..a9f6431dd69a 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -475,7 +475,7 @@ static void smc_llc_rx_delete_link(struct smc_link *link, smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); } smc_llc_send_message(link, llc, sizeof(*llc)); - smc_lgr_schedule_free_work_fast(lgr); + smc_lgr_terminate_sched(lgr); } } @@ -614,7 +614,7 @@ static void smc_llc_testlink_work(struct work_struct *work) rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp, SMC_LLC_WAIT_TIME); if (rc <= 0) { - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate(smc_get_lgr(link), true); return; } next_interval = link->llc_testlink_time; @@ -656,6 +656,7 @@ void smc_llc_link_active(struct smc_link *link, int testlink_time) void smc_llc_link_deleting(struct smc_link *link) { link->state = SMC_LNK_DELETING; + smc_wr_wakeup_tx_wait(link); } /* called in tasklet context */ @@ -663,6 +664,8 @@ void smc_llc_link_inactive(struct smc_link *link) { link->state = SMC_LNK_INACTIVE; cancel_delayed_work(&link->llc_testlink_wrk); + smc_wr_wakeup_reg_wait(link); + smc_wr_wakeup_tx_wait(link); } /* called in worker context */ @@ -695,9 +698,11 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc) { - int rc; + int rc = 0; mutex_lock(&link->llc_delete_rkey_mutex); + if (link->state != SMC_LNK_ACTIVE) + goto out; reinit_completion(&link->llc_delete_rkey); rc = smc_llc_send_delete_rkey(link, rmb_desc); if (rc) diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index bab2da8cf17a..2a5ed47c3e08 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -376,8 +376,6 @@ static int smc_pnet_fill_entry(struct net *net, return 0; error: - if (pnetelem->ndev) - dev_put(pnetelem->ndev); return rc; } @@ -613,7 +611,7 @@ static const struct genl_ops smc_pnet_ops[] = { { .cmd = SMC_PNETID_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, + /* can be retrieved by unprivileged users */ .doit = smc_pnet_get, .dumpit = smc_pnet_dump, .start = smc_pnet_dump_start @@ -718,7 +716,7 @@ static struct net_device *pnet_find_base_ndev(struct net_device *ndev) int i, nest_lvl; rtnl_lock(); - nest_lvl = dev_get_nest_level(ndev); + nest_lvl = ndev->lower_level; for (i = 0; i < nest_lvl; i++) { struct list_head *lower = &ndev->adj_list.lower; @@ -781,6 +779,7 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, dev_put(ndev); if (netdev == ndev && smc_ib_port_active(ibdev, i) && + !test_bit(i - 1, ibdev->ports_going_away) && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL)) { ini->ib_dev = ibdev; @@ -820,6 +819,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, continue; if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) && smc_ib_port_active(ibdev, i) && + !test_bit(i - 1, ibdev->ports_going_away) && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL)) { ini->ib_dev = ibdev; @@ -846,7 +846,8 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, spin_lock(&smcd_dev_list.lock); list_for_each_entry(ismdev, &smcd_dev_list.list, list) { - if (smc_pnet_match(ismdev->pnetid, ndev_pnetid)) { + if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) && + !ismdev->going_away) { ini->ism_dev = ismdev; break; } diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 413a6abf227e..39d7b34d06d2 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -201,6 +201,8 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; + struct smc_cdc_conn_state_flags *cflags = + &conn->local_tx_ctrl.conn_state_flags; struct sock *sk = &smc->sk; int rc; @@ -210,9 +212,10 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, add_wait_queue(sk_sleep(sk), &wait); rc = sk_wait_event(sk, timeo, sk->sk_err || + cflags->peer_conn_abort || sk->sk_shutdown & RCV_SHUTDOWN || - fcrit(conn) || - smc_cdc_rxed_any_close_or_senddone(conn), + conn->killed || + fcrit(conn), &wait); remove_wait_queue(sk_sleep(sk), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); @@ -262,6 +265,18 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, return -EAGAIN; } +static bool smc_rx_recvmsg_data_available(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + + if (smc_rx_data_available(conn)) + return true; + else if (conn->urg_state == SMC_URG_VALID) + /* we received a single urgent Byte - skip */ + smc_rx_update_cons(smc, 0); + return false; +} + /* smc_rx_recvmsg - receive data from RMBE * @msg: copy data to receive buffer * @pipe: copy data to pipe if set - indicates splice() call @@ -303,16 +318,20 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, if (read_done >= target || (pipe && read_done)) break; - if (atomic_read(&conn->bytes_to_rcv)) + if (conn->killed) + break; + + if (smc_rx_recvmsg_data_available(smc)) goto copy; - else if (conn->urg_state == SMC_URG_VALID) - /* we received a single urgent Byte - skip */ - smc_rx_update_cons(smc, 0); - if (sk->sk_shutdown & RCV_SHUTDOWN || - smc_cdc_rxed_any_close_or_senddone(conn) || - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) + if (sk->sk_shutdown & RCV_SHUTDOWN) { + /* smc_cdc_msg_recv_action() could have run after + * above smc_rx_recvmsg_data_available() + */ + if (smc_rx_recvmsg_data_available(smc)) + goto copy; break; + } if (read_done) { if (sk->sk_err || diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index f0de323d15d6..0d42e7716b91 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -76,18 +76,17 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; - bool noblock; long timeo; int rc = 0; /* similar to sk_stream_wait_memory */ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); - noblock = timeo ? false : true; add_wait_queue(sk_sleep(sk), &wait); while (1) { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || + conn->killed || conn->local_tx_ctrl.conn_state_flags.peer_done_writing) { rc = -EPIPE; break; @@ -97,8 +96,8 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) break; } if (!timeo) { - if (noblock) - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + /* ensure EPOLLOUT is subsequently generated */ + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); rc = -EAGAIN; break; } @@ -157,7 +156,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) return -ENOTCONN; if (smc->sk.sk_shutdown & SEND_SHUTDOWN || (smc->sk.sk_err == ECONNABORTED) || - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) + conn->killed) return -EPIPE; if (smc_cdc_rxed_any_close(conn)) return send_done ?: -ECONNRESET; @@ -284,10 +283,8 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); - if (rc) { - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; - smc_lgr_terminate(lgr); - } + if (rc) + smc_lgr_terminate(lgr, true); return rc; } @@ -497,10 +494,11 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) if (smc->sk.sk_err == ECONNABORTED) return sock_error(&smc->sk); + if (conn->killed) + return -EPIPE; rc = 0; - if (conn->alert_token_local) /* connection healthy */ - mod_delayed_work(system_wq, &conn->tx_work, - SMC_TX_WORK_DELAY); + mod_delayed_work(system_wq, &conn->tx_work, + SMC_TX_WORK_DELAY); } return rc; } @@ -549,6 +547,9 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) { int rc; + if (conn->killed || + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + return -EPIPE; /* connection being aborted */ if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else @@ -575,9 +576,7 @@ void smc_tx_work(struct work_struct *work) int rc; lock_sock(&smc->sk); - if (smc->sk.sk_err || - !conn->alert_token_local || - conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + if (smc->sk.sk_err) goto out; rc = smc_tx_sndbuf_nonempty(conn); @@ -610,8 +609,11 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force) ((to_confirm > conn->rmbe_update_limit) && ((sender_free <= (conn->rmb_desc->len / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { + if (conn->killed || + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + return; if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && - conn->alert_token_local) { /* connection healthy */ + !conn->killed) { schedule_delayed_work(&conn->tx_work, SMC_TX_WORK_DELAY); return; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 253aa75dc2b6..337ee52ad3d3 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -50,6 +50,26 @@ struct smc_wr_tx_pend { /* control data for a pending send request */ /*------------------------------- completion --------------------------------*/ +/* returns true if at least one tx work request is pending on the given link */ +static inline bool smc_wr_is_tx_pend(struct smc_link *link) +{ + if (find_first_bit(link->wr_tx_mask, link->wr_tx_cnt) != + link->wr_tx_cnt) { + return true; + } + return false; +} + +/* wait till all pending tx work requests on the given link are completed */ +static inline int smc_wr_tx_wait_no_pending_sends(struct smc_link *link) +{ + if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link), + SMC_WR_TX_WAIT_PENDING_TIME)) + return 0; + else /* timeout */ + return -EPIPE; +} + static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id) { u32 i; @@ -75,7 +95,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) link->wr_reg_state = FAILED; else link->wr_reg_state = CONFIRMED; - wake_up(&link->wr_reg_wait); + smc_wr_wakeup_reg_wait(link); return; } @@ -101,7 +121,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) clear_bit(i, link->wr_tx_mask); } /* terminate connections of this link group abnormally */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); @@ -171,6 +191,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, struct smc_rdma_wr **wr_rdma_buf, struct smc_wr_tx_pend_priv **wr_pend_priv) { + struct smc_link_group *lgr = smc_get_lgr(link); struct smc_wr_tx_pend *wr_pend; u32 idx = link->wr_tx_cnt; struct ib_send_wr *wr_ib; @@ -179,19 +200,20 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, *wr_buf = NULL; *wr_pend_priv = NULL; - if (in_softirq()) { + if (in_softirq() || lgr->terminating) { rc = smc_wr_tx_get_free_slot_index(link, &idx); if (rc) return rc; } else { - rc = wait_event_timeout( + rc = wait_event_interruptible_timeout( link->wr_tx_wait, link->state == SMC_LNK_INACTIVE || + lgr->terminating || (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { /* timeout - terminate connections */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(lgr); return -EPIPE; } if (idx == link->wr_tx_cnt) @@ -227,6 +249,7 @@ int smc_wr_tx_put_slot(struct smc_link *link, memset(&link->wr_tx_bufs[idx], 0, sizeof(link->wr_tx_bufs[idx])); test_and_clear_bit(idx, link->wr_tx_mask); + wake_up(&link->wr_tx_wait); return 1; } @@ -247,7 +270,7 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { smc_wr_tx_put_slot(link, priv); - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); } return rc; } @@ -272,7 +295,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) SMC_WR_REG_MR_WAIT_TIME); if (!rc) { /* timeout - terminate connections */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); return -EPIPE; } if (rc == -ERESTARTSYS) @@ -373,7 +396,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) /* terminate connections of this link group * abnormally */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); break; default: smc_wr_rx_post(link); /* refill WR RX */ @@ -510,8 +533,10 @@ void smc_wr_free_link(struct smc_link *lnk) { struct ib_device *ibdev; - memset(lnk->wr_tx_mask, 0, - BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); + if (smc_wr_tx_wait_no_pending_sends(lnk)) + memset(lnk->wr_tx_mask, 0, + BITS_TO_LONGS(SMC_WR_BUF_CNT) * + sizeof(*lnk->wr_tx_mask)); if (!lnk->smcibdev) return; diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 09bf32fd3959..3ac99c898418 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -60,6 +60,16 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val) atomic_long_set(wr_tx_id, val); } +static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk) +{ + wake_up_all(&lnk->wr_tx_wait); +} + +static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk) +{ + wake_up(&lnk->wr_reg_wait); +} + /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { diff --git a/net/socket.c b/net/socket.c index 6a9ab7a8b1d2..b79a05de7c6e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -100,6 +100,7 @@ #include <linux/if_tun.h> #include <linux/ipv6_route.h> #include <linux/route.h> +#include <linux/termios.h> #include <linux/sockios.h> #include <net/busy_poll.h> #include <linux/errqueue.h> @@ -128,6 +129,18 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); +#ifdef CONFIG_PROC_FS +static void sock_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct socket *sock = f->private_data; + + if (sock->ops->show_fdinfo) + sock->ops->show_fdinfo(m, sock); +} +#else +#define sock_show_fdinfo NULL +#endif + /* * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear * in the operation structures but are done directly via the socketcall() multiplexor. @@ -149,6 +162,7 @@ static const struct file_operations socket_file_ops = { .sendpage = sock_sendpage, .splice_write = generic_splice_sendpage, .splice_read = sock_splice_read, + .show_fdinfo = sock_show_fdinfo, }; /* @@ -404,6 +418,7 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) sock->file = file; file->private_data = sock; + stream_open(SOCK_INODE(sock), file); return file; } EXPORT_SYMBOL(sock_alloc_file); @@ -793,7 +808,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, sizeof(ts), &ts); } else { - struct timespec ts; + struct __kernel_old_timespec ts; skb_get_timestampns(skb, &ts); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, @@ -955,7 +970,7 @@ static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) .msg_iocb = iocb}; ssize_t res; - if (file->f_flags & O_NONBLOCK) + if (file->f_flags & O_NONBLOCK || (iocb->ki_flags & IOCB_NOWAIT)) msg.msg_flags = MSG_DONTWAIT; if (iocb->ki_pos != 0) @@ -980,7 +995,7 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_pos != 0) return -ESPIPE; - if (file->f_flags & O_NONBLOCK) + if (file->f_flags & O_NONBLOCK || (iocb->ki_flags & IOCB_NOWAIT)) msg.msg_flags = MSG_DONTWAIT; if (sock->type == SOCK_SEQPACKET) @@ -1690,24 +1705,13 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog) return __sys_listen(fd, backlog); } -/* - * For accept, we attempt to create a new socket, set up the link - * with the client, wake up the client, then return the new - * connected fd. We collect the address of the connector in kernel - * space and move it to user at the very end. This is unclean because - * we open the socket then return an error. - * - * 1003.1g adds the ability to recvmsg() to query connection pending - * status to recvmsg. We need to add that support in a way thats - * clean when we restructure accept also. - */ - -int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, int flags) +int __sys_accept4_file(struct file *file, unsigned file_flags, + struct sockaddr __user *upeer_sockaddr, + int __user *upeer_addrlen, int flags) { struct socket *sock, *newsock; struct file *newfile; - int err, len, newfd, fput_needed; + int err, len, newfd; struct sockaddr_storage address; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) @@ -1716,14 +1720,14 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - sock = sockfd_lookup_light(fd, &err, &fput_needed); + sock = sock_from_file(file, &err); if (!sock) goto out; err = -ENFILE; newsock = sock_alloc(); if (!newsock) - goto out_put; + goto out; newsock->type = sock->type; newsock->ops = sock->ops; @@ -1738,20 +1742,21 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, if (unlikely(newfd < 0)) { err = newfd; sock_release(newsock); - goto out_put; + goto out; } newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); if (IS_ERR(newfile)) { err = PTR_ERR(newfile); put_unused_fd(newfd); - goto out_put; + goto out; } err = security_socket_accept(sock, newsock); if (err) goto out_fd; - err = sock->ops->accept(sock, newsock, sock->file->f_flags, false); + err = sock->ops->accept(sock, newsock, sock->file->f_flags | file_flags, + false); if (err < 0) goto out_fd; @@ -1772,15 +1777,42 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, fd_install(newfd, newfile); err = newfd; - -out_put: - fput_light(sock->file, fput_needed); out: return err; out_fd: fput(newfile); put_unused_fd(newfd); - goto out_put; + goto out; + +} + +/* + * For accept, we attempt to create a new socket, set up the link + * with the client, wake up the client, then return the new + * connected fd. We collect the address of the connector in kernel + * space and move it to user at the very end. This is unclean because + * we open the socket then return an error. + * + * 1003.1g adds the ability to recvmsg() to query connection pending + * status to recvmsg. We need to add that support in a way thats + * clean when we restructure accept also. + */ + +int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, + int __user *upeer_addrlen, int flags) +{ + int ret = -EBADF; + struct fd f; + + f = fdget(fd); + if (f.file) { + ret = __sys_accept4_file(f.file, 0, upeer_sockaddr, + upeer_addrlen, flags); + if (f.flags) + fput(f.file); + } + + return ret; } SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, @@ -1807,32 +1839,46 @@ SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr, * include the -EINPROGRESS status for such sockets. */ -int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen) +int __sys_connect_file(struct file *file, struct sockaddr_storage *address, + int addrlen, int file_flags) { struct socket *sock; - struct sockaddr_storage address; - int err, fput_needed; + int err; - sock = sockfd_lookup_light(fd, &err, &fput_needed); + sock = sock_from_file(file, &err); if (!sock) goto out; - err = move_addr_to_kernel(uservaddr, addrlen, &address); - if (err < 0) - goto out_put; err = - security_socket_connect(sock, (struct sockaddr *)&address, addrlen); + security_socket_connect(sock, (struct sockaddr *)address, addrlen); if (err) - goto out_put; + goto out; - err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen, - sock->file->f_flags); -out_put: - fput_light(sock->file, fput_needed); + err = sock->ops->connect(sock, (struct sockaddr *)address, addrlen, + sock->file->f_flags | file_flags); out: return err; } +int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen) +{ + int ret = -EBADF; + struct fd f; + + f = fdget(fd); + if (f.file) { + struct sockaddr_storage address; + + ret = move_addr_to_kernel(uservaddr, addrlen, &address); + if (!ret) + ret = __sys_connect_file(f.file, &address, addrlen, 0); + if (f.flags) + fput(f.file); + } + + return ret; +} + SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, int, addrlen) { @@ -2232,15 +2278,10 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, return err < 0 ? err : 0; } -static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, - struct msghdr *msg_sys, unsigned int flags, - struct used_address *used_address, - unsigned int allowed_msghdr_flags) +static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, + unsigned int flags, struct used_address *used_address, + unsigned int allowed_msghdr_flags) { - struct compat_msghdr __user *msg_compat = - (struct compat_msghdr __user *)msg; - struct sockaddr_storage address; - struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; unsigned char ctl[sizeof(struct cmsghdr) + 20] __aligned(sizeof(__kernel_size_t)); /* 20 is size of ipv6_pktinfo */ @@ -2248,19 +2289,10 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, int ctl_len; ssize_t err; - msg_sys->msg_name = &address; - - if (MSG_CMSG_COMPAT & flags) - err = get_compat_msghdr(msg_sys, msg_compat, NULL, &iov); - else - err = copy_msghdr_from_user(msg_sys, msg, NULL, &iov); - if (err < 0) - return err; - err = -ENOBUFS; if (msg_sys->msg_controllen > INT_MAX) - goto out_freeiov; + goto out; flags |= (msg_sys->msg_flags & allowed_msghdr_flags); ctl_len = msg_sys->msg_controllen; if ((MSG_CMSG_COMPAT & flags) && ctl_len) { @@ -2268,7 +2300,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl, sizeof(ctl)); if (err) - goto out_freeiov; + goto out; ctl_buf = msg_sys->msg_control; ctl_len = msg_sys->msg_controllen; } else if (ctl_len) { @@ -2277,7 +2309,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, if (ctl_len > sizeof(ctl)) { ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL); if (ctl_buf == NULL) - goto out_freeiov; + goto out; } err = -EFAULT; /* @@ -2323,7 +2355,47 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, out_freectl: if (ctl_buf != ctl) sock_kfree_s(sock->sk, ctl_buf, ctl_len); -out_freeiov: +out: + return err; +} + +int sendmsg_copy_msghdr(struct msghdr *msg, + struct user_msghdr __user *umsg, unsigned flags, + struct iovec **iov) +{ + int err; + + if (flags & MSG_CMSG_COMPAT) { + struct compat_msghdr __user *msg_compat; + + msg_compat = (struct compat_msghdr __user *) umsg; + err = get_compat_msghdr(msg, msg_compat, NULL, iov); + } else { + err = copy_msghdr_from_user(msg, umsg, NULL, iov); + } + if (err < 0) + return err; + + return 0; +} + +static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, + struct msghdr *msg_sys, unsigned int flags, + struct used_address *used_address, + unsigned int allowed_msghdr_flags) +{ + struct sockaddr_storage address; + struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; + ssize_t err; + + msg_sys->msg_name = &address; + + err = sendmsg_copy_msghdr(msg_sys, msg, flags, &iov); + if (err < 0) + return err; + + err = ____sys_sendmsg(sock, msg_sys, flags, used_address, + allowed_msghdr_flags); kfree(iov); return err; } @@ -2331,12 +2403,14 @@ out_freeiov: /* * BSD sendmsg interface */ -long __sys_sendmsg_sock(struct socket *sock, struct user_msghdr __user *msg, +long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg, unsigned int flags) { - struct msghdr msg_sys; + /* disallow ancillary data requests from this path */ + if (msg->msg_control || msg->msg_controllen) + return -EINVAL; - return ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0); + return ____sys_sendmsg(sock, msg, flags, NULL, 0); } long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, @@ -2442,33 +2516,41 @@ SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg, return __sys_sendmmsg(fd, mmsg, vlen, flags, true); } -static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, - struct msghdr *msg_sys, unsigned int flags, int nosec) +int recvmsg_copy_msghdr(struct msghdr *msg, + struct user_msghdr __user *umsg, unsigned flags, + struct sockaddr __user **uaddr, + struct iovec **iov) { - struct compat_msghdr __user *msg_compat = - (struct compat_msghdr __user *)msg; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - unsigned long cmsg_ptr; - int len; ssize_t err; - /* kernel mode address */ - struct sockaddr_storage addr; - - /* user mode address pointers */ - struct sockaddr __user *uaddr; - int __user *uaddr_len = COMPAT_NAMELEN(msg); - - msg_sys->msg_name = &addr; + if (MSG_CMSG_COMPAT & flags) { + struct compat_msghdr __user *msg_compat; - if (MSG_CMSG_COMPAT & flags) - err = get_compat_msghdr(msg_sys, msg_compat, &uaddr, &iov); - else - err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov); + msg_compat = (struct compat_msghdr __user *) umsg; + err = get_compat_msghdr(msg, msg_compat, uaddr, iov); + } else { + err = copy_msghdr_from_user(msg, umsg, uaddr, iov); + } if (err < 0) return err; + return 0; +} + +static int ____sys_recvmsg(struct socket *sock, struct msghdr *msg_sys, + struct user_msghdr __user *msg, + struct sockaddr __user *uaddr, + unsigned int flags, int nosec) +{ + struct compat_msghdr __user *msg_compat = + (struct compat_msghdr __user *) msg; + int __user *uaddr_len = COMPAT_NAMELEN(msg); + struct sockaddr_storage addr; + unsigned long cmsg_ptr; + int len; + ssize_t err; + + msg_sys->msg_name = &addr; cmsg_ptr = (unsigned long)msg_sys->msg_control; msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); @@ -2477,9 +2559,14 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; - err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys, flags); + + if (unlikely(nosec)) + err = sock_recvmsg_nosec(sock, msg_sys, flags); + else + err = sock_recvmsg(sock, msg_sys, flags); + if (err < 0) - goto out_freeiov; + goto out; len = err; if (uaddr != NULL) { @@ -2487,12 +2574,12 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, msg_sys->msg_namelen, uaddr, uaddr_len); if (err < 0) - goto out_freeiov; + goto out; } err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT), COMPAT_FLAGS(msg)); if (err) - goto out_freeiov; + goto out; if (MSG_CMSG_COMPAT & flags) err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, &msg_compat->msg_controllen); @@ -2500,10 +2587,25 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, &msg->msg_controllen); if (err) - goto out_freeiov; + goto out; err = len; +out: + return err; +} -out_freeiov: +static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, + struct msghdr *msg_sys, unsigned int flags, int nosec) +{ + struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; + /* user mode address pointers */ + struct sockaddr __user *uaddr; + ssize_t err; + + err = recvmsg_copy_msghdr(msg_sys, msg, flags, &uaddr, &iov); + if (err < 0) + return err; + + err = ____sys_recvmsg(sock, msg_sys, msg, uaddr, flags, nosec); kfree(iov); return err; } @@ -2512,12 +2614,15 @@ out_freeiov: * BSD recvmsg interface */ -long __sys_recvmsg_sock(struct socket *sock, struct user_msghdr __user *msg, - unsigned int flags) +long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, + struct user_msghdr __user *umsg, + struct sockaddr __user *uaddr, unsigned int flags) { - struct msghdr msg_sys; + /* disallow ancillary data requests from this path */ + if (msg->msg_control || msg->msg_controllen) + return -EINVAL; - return ___sys_recvmsg(sock, msg, &msg_sys, flags, 0); + return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0); } long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, @@ -2833,7 +2938,7 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) a[2], true); break; case SYS_RECVMMSG: - if (IS_ENABLED(CONFIG_64BIT) || !IS_ENABLED(CONFIG_64BIT_TIME)) + if (IS_ENABLED(CONFIG_64BIT)) err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], (struct __kernel_timespec __user *)a[4], @@ -3452,6 +3557,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCSARP: case SIOCGARP: case SIOCDARP: + case SIOCOUTQ: + case SIOCOUTQNSD: case SIOCATMARK: return sock_do_ioctl(net, sock, cmd, arg); } diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c index d024af4be85e..8b4d72b1a066 100644 --- a/net/sunrpc/addr.c +++ b/net/sunrpc/addr.c @@ -175,7 +175,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf, return 0; len = (buf + buflen) - delim - 1; - p = kstrndup(delim + 1, len, GFP_KERNEL); + p = kmemdup_nul(delim + 1, len, GFP_KERNEL); if (p) { u32 scope_id = 0; struct net_device *dev; diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index cdb05b48de44..5748ad0ba1bd 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -221,55 +221,6 @@ rpcauth_get_gssinfo(rpc_authflavor_t pseudoflavor, struct rpcsec_gss_info *info) } EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo); -/** - * rpcauth_list_flavors - discover registered flavors and pseudoflavors - * @array: array to fill in - * @size: size of "array" - * - * Returns the number of array items filled in, or a negative errno. - * - * The returned array is not sorted by any policy. Callers should not - * rely on the order of the items in the returned array. - */ -int -rpcauth_list_flavors(rpc_authflavor_t *array, int size) -{ - const struct rpc_authops *ops; - rpc_authflavor_t flavor, pseudos[4]; - int i, len, result = 0; - - rcu_read_lock(); - for (flavor = 0; flavor < RPC_AUTH_MAXFLAVOR; flavor++) { - ops = rcu_dereference(auth_flavors[flavor]); - if (result >= size) { - result = -ENOMEM; - break; - } - - if (ops == NULL) - continue; - if (ops->list_pseudoflavors == NULL) { - array[result++] = ops->au_flavor; - continue; - } - len = ops->list_pseudoflavors(pseudos, ARRAY_SIZE(pseudos)); - if (len < 0) { - result = len; - break; - } - for (i = 0; i < len; i++) { - if (result >= size) { - result = -ENOMEM; - break; - } - array[result++] = pseudos[i]; - } - } - rcu_read_unlock(); - return result; -} -EXPORT_SYMBOL_GPL(rpcauth_list_flavors); - struct rpc_auth * rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 4ce42c62458e..24ca861815b1 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1960,7 +1960,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len)) goto unwrap_failed; - if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset)) + if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset)) goto unwrap_failed; maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) @@ -2118,7 +2118,6 @@ static const struct rpc_authops authgss_ops = { .hash_cred = gss_hash_cred, .lookup_cred = gss_lookup_cred, .crcreate = gss_create_cred, - .list_pseudoflavors = gss_mech_list_pseudoflavors, .info2flavor = gss_mech_info2flavor, .flavor2info = gss_mech_flavor2info, }; diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c index 550fdf18d3b3..3b7f721c023b 100644 --- a/net/sunrpc/auth_gss/gss_krb5_keys.c +++ b/net/sunrpc/auth_gss/gss_krb5_keys.c @@ -228,14 +228,11 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e, ret = 0; err_free_raw: - memset(rawkey, 0, keybytes); - kfree(rawkey); + kzfree(rawkey); err_free_out: - memset(outblockdata, 0, blocksize); - kfree(outblockdata); + kzfree(outblockdata); err_free_in: - memset(inblockdata, 0, blocksize); - kfree(inblockdata); + kzfree(inblockdata); err_free_cipher: crypto_free_sync_skcipher(cipher); err_return: diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 6e5d6d240215..75b3c2e9e8f8 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -253,6 +253,7 @@ gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx) { u32 seq_send; int tmp; + u32 time32; p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate)); if (IS_ERR(p)) @@ -290,9 +291,11 @@ gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx) p = ERR_PTR(-ENOSYS); goto out_err; } - p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); + p = simple_get_bytes(p, end, &time32, sizeof(time32)); if (IS_ERR(p)) goto out_err; + /* unsigned 32-bit time overflows in year 2106 */ + ctx->endtime = (time64_t)time32; p = simple_get_bytes(p, end, &seq_send, sizeof(seq_send)); if (IS_ERR(p)) goto out_err; @@ -587,15 +590,18 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, { u64 seq_send64; int keylen; + u32 time32; p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags)); if (IS_ERR(p)) goto out_err; ctx->initiate = ctx->flags & KRB5_CTX_FLAG_INITIATOR; - p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); + p = simple_get_bytes(p, end, &time32, sizeof(time32)); if (IS_ERR(p)) goto out_err; + /* unsigned 32-bit time overflows in year 2106 */ + ctx->endtime = (time64_t)time32; p = simple_get_bytes(p, end, &seq_send64, sizeof(seq_send64)); if (IS_ERR(p)) goto out_err; @@ -659,7 +665,7 @@ out_err: static int gss_import_sec_context_kerberos(const void *p, size_t len, struct gss_ctx *ctx_id, - time_t *endtime, + time64_t *endtime, gfp_t gfp_mask) { const void *end = (const void *)((const char *)p + len); diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 48fe4a591b54..f1d280accf43 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -131,14 +131,14 @@ gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), .data = cksumdata}; void *ptr; - s32 now; + time64_t now; u32 seq_send; u8 *cksumkey; dprintk("RPC: %s\n", __func__); BUG_ON(ctx == NULL); - now = get_seconds(); + now = ktime_get_real_seconds(); ptr = setup_token(ctx, token); @@ -170,7 +170,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, struct xdr_netobj cksumobj = { .len = sizeof(cksumdata), .data = cksumdata}; void *krb5_hdr; - s32 now; + time64_t now; u8 *cksumkey; unsigned int cksum_usage; __be64 seq_send_be64; @@ -198,7 +198,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, memcpy(krb5_hdr + GSS_KRB5_TOK_HDR_LEN, cksumobj.data, cksumobj.len); - now = get_seconds(); + now = ktime_get_real_seconds(); return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; } diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index ef2b25b86d2f..aaab91cf24c8 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -124,7 +124,7 @@ gss_verify_mic_v1(struct krb5_ctx *ctx, /* it got through unscathed. Make sure the context is unexpired */ - now = get_seconds(); + now = ktime_get_real_seconds(); if (now > ctx->endtime) return GSS_S_CONTEXT_EXPIRED; @@ -149,7 +149,7 @@ gss_verify_mic_v2(struct krb5_ctx *ctx, char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; struct xdr_netobj cksumobj = {.len = sizeof(cksumdata), .data = cksumdata}; - s32 now; + time64_t now; u8 *ptr = read_token->data; u8 *cksumkey; u8 flags; @@ -194,7 +194,7 @@ gss_verify_mic_v2(struct krb5_ctx *ctx, return GSS_S_BAD_SIG; /* it got through unscathed. Make sure the context is unexpired */ - now = get_seconds(); + now = ktime_get_real_seconds(); if (now > ctx->endtime) return GSS_S_CONTEXT_EXPIRED; diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 14a0aff0cd84..6c1920eed771 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -163,7 +163,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, .data = cksumdata}; int blocksize = 0, plainlen; unsigned char *ptr, *msg_start; - s32 now; + time64_t now; int headlen; struct page **tmp_pages; u32 seq_send; @@ -172,7 +172,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, dprintk("RPC: %s\n", __func__); - now = get_seconds(); + now = ktime_get_real_seconds(); blocksize = crypto_sync_skcipher_blocksize(kctx->enc); gss_krb5_add_padding(buf, offset, blocksize); @@ -268,7 +268,7 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), .data = cksumdata}; - s32 now; + time64_t now; int direction; s32 seqnum; unsigned char *ptr; @@ -359,7 +359,7 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) /* it got through unscathed. Make sure the context is unexpired */ - now = get_seconds(); + now = ktime_get_real_seconds(); if (now > kctx->endtime) return GSS_S_CONTEXT_EXPIRED; @@ -439,7 +439,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, struct page **pages) { u8 *ptr, *plainhdr; - s32 now; + time64_t now; u8 flags = 0x00; __be16 *be16ptr; __be64 *be64ptr; @@ -481,14 +481,14 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, if (err) return err; - now = get_seconds(); + now = ktime_get_real_seconds(); return (kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; } static u32 gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) { - s32 now; + time64_t now; u8 *ptr; u8 flags = 0x00; u16 ec, rrc; @@ -557,7 +557,7 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) /* do sequencing checks */ /* it got through unscathed. Make sure the context is unexpired */ - now = get_seconds(); + now = ktime_get_real_seconds(); if (now > kctx->endtime) return GSS_S_CONTEXT_EXPIRED; diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 82060099a429..db550bfc2642 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -20,6 +20,7 @@ #include <linux/sunrpc/sched.h> #include <linux/sunrpc/gss_api.h> #include <linux/sunrpc/clnt.h> +#include <trace/events/rpcgss.h> #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH @@ -158,7 +159,6 @@ struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj) if (sprint_oid(obj->data, obj->len, buf, sizeof(buf)) < 0) return NULL; - dprintk("RPC: %s(%s)\n", __func__, buf); request_module("rpc-auth-gss-%s", buf); rcu_read_lock(); @@ -172,6 +172,8 @@ struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj) } } rcu_read_unlock(); + if (!gm) + trace_rpcgss_oid_to_mech(buf); return gm; } @@ -218,35 +220,6 @@ gss_mech_get_by_pseudoflavor(u32 pseudoflavor) } /** - * gss_mech_list_pseudoflavors - Discover registered GSS pseudoflavors - * @array_ptr: array to fill in - * @size: size of "array" - * - * Returns the number of array items filled in, or a negative errno. - * - * The returned array is not sorted by any policy. Callers should not - * rely on the order of the items in the returned array. - */ -int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size) -{ - struct gss_api_mech *pos = NULL; - int j, i = 0; - - rcu_read_lock(); - list_for_each_entry_rcu(pos, ®istered_mechs, gm_list) { - for (j = 0; j < pos->gm_pf_num; j++) { - if (i >= size) { - spin_unlock(®istered_mechs_lock); - return -ENOMEM; - } - array_ptr[i++] = pos->gm_pfs[j].pseudoflavor; - } - } - rcu_read_unlock(); - return i; -} - -/** * gss_svc_to_pseudoflavor - map a GSS service number to a pseudoflavor * @gm: GSS mechanism handle * @qop: GSS quality-of-protection value @@ -374,7 +347,7 @@ int gss_import_sec_context(const void *input_token, size_t bufsize, struct gss_api_mech *mech, struct gss_ctx **ctx_id, - time_t *endtime, + time64_t *endtime, gfp_t gfp_mask) { if (!(*ctx_id = kzalloc(sizeof(**ctx_id), gfp_mask))) diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 8be2f209982b..65b67b257302 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -49,6 +49,9 @@ #include <linux/sunrpc/svcauth.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/cache.h> + +#include <trace/events/rpcgss.h> + #include "gss_rpc_upcall.h" @@ -200,7 +203,7 @@ static int rsi_parse(struct cache_detail *cd, char *ep; int len; struct rsi rsii, *rsip = NULL; - time_t expiry; + time64_t expiry; int status = -EINVAL; memset(&rsii, 0, sizeof(rsii)); @@ -433,7 +436,7 @@ static int rsc_parse(struct cache_detail *cd, int id; int len, rv; struct rsc rsci, *rscp = NULL; - time_t expiry; + time64_t expiry; int status = -EINVAL; struct gss_api_mech *gm = NULL; @@ -1075,24 +1078,32 @@ gss_read_verf(struct rpc_gss_wire_cred *gc, return 0; } -/* Ok this is really heavily depending on a set of semantics in - * how rqstp is set up by svc_recv and pages laid down by the - * server when reading a request. We are basically guaranteed that - * the token lays all down linearly across a set of pages, starting - * at iov_base in rq_arg.head[0] which happens to be the first of a - * set of pages stored in rq_pages[]. - * rq_arg.head[0].iov_base will provide us the page_base to pass - * to the upcall. - */ -static inline int -gss_read_proxy_verf(struct svc_rqst *rqstp, - struct rpc_gss_wire_cred *gc, __be32 *authp, - struct xdr_netobj *in_handle, - struct gssp_in_token *in_token) +static void gss_free_in_token_pages(struct gssp_in_token *in_token) { - struct kvec *argv = &rqstp->rq_arg.head[0]; u32 inlen; - int res; + int i; + + i = 0; + inlen = in_token->page_len; + while (inlen) { + if (in_token->pages[i]) + put_page(in_token->pages[i]); + inlen -= inlen > PAGE_SIZE ? PAGE_SIZE : inlen; + } + + kfree(in_token->pages); + in_token->pages = NULL; +} + +static int gss_read_proxy_verf(struct svc_rqst *rqstp, + struct rpc_gss_wire_cred *gc, __be32 *authp, + struct xdr_netobj *in_handle, + struct gssp_in_token *in_token) +{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + unsigned int page_base, length; + int pages, i, res; + size_t inlen; res = gss_read_common_verf(gc, argv, authp, in_handle); if (res) @@ -1102,10 +1113,36 @@ gss_read_proxy_verf(struct svc_rqst *rqstp, if (inlen > (argv->iov_len + rqstp->rq_arg.page_len)) return SVC_DENIED; - in_token->pages = rqstp->rq_pages; - in_token->page_base = (ulong)argv->iov_base & ~PAGE_MASK; + pages = DIV_ROUND_UP(inlen, PAGE_SIZE); + in_token->pages = kcalloc(pages, sizeof(struct page *), GFP_KERNEL); + if (!in_token->pages) + return SVC_DENIED; + in_token->page_base = 0; in_token->page_len = inlen; + for (i = 0; i < pages; i++) { + in_token->pages[i] = alloc_page(GFP_KERNEL); + if (!in_token->pages[i]) { + gss_free_in_token_pages(in_token); + return SVC_DENIED; + } + } + length = min_t(unsigned int, inlen, argv->iov_len); + memcpy(page_address(in_token->pages[0]), argv->iov_base, length); + inlen -= length; + + i = 1; + page_base = rqstp->rq_arg.page_base; + while (inlen) { + length = min_t(unsigned int, inlen, PAGE_SIZE); + memcpy(page_address(in_token->pages[i]), + page_address(rqstp->rq_arg.pages[i]) + page_base, + length); + + inlen -= length; + page_base = 0; + i++; + } return 0; } @@ -1184,7 +1221,7 @@ static int gss_proxy_save_rsc(struct cache_detail *cd, static atomic64_t ctxhctr; long long ctxh; struct gss_api_mech *gm = NULL; - time_t expiry; + time64_t expiry; int status = -EINVAL; memset(&rsci, 0, sizeof(rsci)); @@ -1211,6 +1248,7 @@ static int gss_proxy_save_rsc(struct cache_detail *cd, dprintk("RPC: No creds found!\n"); goto out; } else { + struct timespec64 boot; /* steal creds */ rsci.cred = ud->creds; @@ -1231,6 +1269,9 @@ static int gss_proxy_save_rsc(struct cache_detail *cd, &expiry, GFP_KERNEL); if (status) goto out; + + getboottime64(&boot); + expiry -= boot.tv_sec; } rsci.h.expiry_time = expiry; @@ -1270,9 +1311,8 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, if (status) goto out; - dprintk("RPC: svcauth_gss: gss major status = %d " - "minor status = %d\n", - ud.major_status, ud.minor_status); + trace_rpcgss_accept_upcall(rqstp->rq_xid, ud.major_status, + ud.minor_status); switch (ud.major_status) { case GSS_S_CONTINUE_NEEDED: @@ -1280,8 +1320,11 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, break; case GSS_S_COMPLETE: status = gss_proxy_save_rsc(sn->rsc_cache, &ud, &handle); - if (status) + if (status) { + pr_info("%s: gss_proxy_save_rsc failed (%d)\n", + __func__, status); goto out; + } cli_handle.data = (u8 *)&handle; cli_handle.len = sizeof(handle); break; @@ -1292,15 +1335,20 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, /* Got an answer to the upcall; use it: */ if (gss_write_init_verf(sn->rsc_cache, rqstp, - &cli_handle, &ud.major_status)) + &cli_handle, &ud.major_status)) { + pr_info("%s: gss_write_init_verf failed\n", __func__); goto out; + } if (gss_write_resv(resv, PAGE_SIZE, &cli_handle, &ud.out_token, - ud.major_status, ud.minor_status)) + ud.major_status, ud.minor_status)) { + pr_info("%s: gss_write_resv failed\n", __func__); goto out; + } ret = SVC_COMPLETE; out: + gss_free_in_token_pages(&ud.in_token); gssp_free_upcall_data(&ud); return ret; } @@ -1384,10 +1432,10 @@ static ssize_t read_gssp(struct file *file, char __user *buf, return len; } -static const struct file_operations use_gss_proxy_ops = { - .open = nonseekable_open, - .write = write_gssp, - .read = read_gssp, +static const struct proc_ops use_gss_proxy_proc_ops = { + .proc_open = nonseekable_open, + .proc_write = write_gssp, + .proc_read = read_gssp, }; static int create_use_gss_proxy_proc_entry(struct net *net) @@ -1398,7 +1446,7 @@ static int create_use_gss_proxy_proc_entry(struct net *net) sn->use_gss_proxy = -1; *p = proc_create_data("use-gss-proxy", S_IFREG | 0600, sn->proc_net_rpc, - &use_gss_proxy_ops, net); + &use_gss_proxy_proc_ops, net); if (!*p) return -ENOMEM; init_gssp_clnt(sn); diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 339e8c077c2d..195b40c5dae4 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -220,7 +220,7 @@ void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs) goto out; spin_lock_bh(&xprt->bc_pa_lock); - xprt->bc_alloc_max -= max_reqs; + xprt->bc_alloc_max -= min(max_reqs, xprt->bc_alloc_max); list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { dprintk("RPC: req=%p\n", req); list_del(&req->rq_bc_pa_list); @@ -307,8 +307,8 @@ void xprt_free_bc_rqst(struct rpc_rqst *req) */ dprintk("RPC: Last session removed req=%p\n", req); xprt_free_allocation(req); - return; } + xprt_put(xprt); } /* @@ -339,7 +339,7 @@ found: spin_unlock(&xprt->bc_pa_lock); if (new) { if (req != new) - xprt_free_bc_rqst(new); + xprt_free_allocation(new); break; } else if (req) break; @@ -368,6 +368,7 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied) set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); dprintk("RPC: add callback request to list\n"); + xprt_get(xprt); spin_lock(&bc_serv->sv_cb_lock); list_add(&req->rq_bc_list, &bc_serv->sv_cb_list); wake_up(&bc_serv->sv_cb_waitq); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 6f1528f271ee..bd843a81afa0 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -42,7 +42,7 @@ static bool cache_listeners_exist(struct cache_detail *detail); static void cache_init(struct cache_head *h, struct cache_detail *detail) { - time_t now = seconds_since_boot(); + time64_t now = seconds_since_boot(); INIT_HLIST_NODE(&h->cache_list); h->flags = 0; kref_init(&h->ref); @@ -53,9 +53,6 @@ static void cache_init(struct cache_head *h, struct cache_detail *detail) h->last_refresh = now; } -static inline int cache_is_valid(struct cache_head *h); -static void cache_fresh_locked(struct cache_head *head, time_t expiry, - struct cache_detail *detail); static void cache_fresh_unlocked(struct cache_head *head, struct cache_detail *detail); @@ -80,6 +77,22 @@ static struct cache_head *sunrpc_cache_find_rcu(struct cache_detail *detail, return NULL; } +static void sunrpc_begin_cache_remove_entry(struct cache_head *ch, + struct cache_detail *cd) +{ + /* Must be called under cd->hash_lock */ + hlist_del_init_rcu(&ch->cache_list); + set_bit(CACHE_CLEANED, &ch->flags); + cd->entries --; +} + +static void sunrpc_end_cache_remove_entry(struct cache_head *ch, + struct cache_detail *cd) +{ + cache_fresh_unlocked(ch, cd); + cache_put(ch, cd); +} + static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, struct cache_head *key, int hash) @@ -103,11 +116,7 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, hlist_for_each_entry_rcu(tmp, head, cache_list) { if (detail->match(tmp, key)) { if (cache_is_expired(detail, tmp)) { - hlist_del_init_rcu(&tmp->cache_list); - detail->entries --; - if (cache_is_valid(tmp) == -EAGAIN) - set_bit(CACHE_NEGATIVE, &tmp->flags); - cache_fresh_locked(tmp, 0, detail); + sunrpc_begin_cache_remove_entry(tmp, detail); freeme = tmp; break; } @@ -123,10 +132,8 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, cache_get(new); spin_unlock(&detail->hash_lock); - if (freeme) { - cache_fresh_unlocked(freeme, detail); - cache_put(freeme, detail); - } + if (freeme) + sunrpc_end_cache_remove_entry(freeme, detail); return new; } @@ -145,10 +152,10 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_lookup_rcu); static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch); -static void cache_fresh_locked(struct cache_head *head, time_t expiry, +static void cache_fresh_locked(struct cache_head *head, time64_t expiry, struct cache_detail *detail) { - time_t now = seconds_since_boot(); + time64_t now = seconds_since_boot(); if (now <= detail->flush_time) /* ensure it isn't immediately treated as expired */ now = detail->flush_time + 1; @@ -280,7 +287,7 @@ int cache_check(struct cache_detail *detail, struct cache_head *h, struct cache_req *rqstp) { int rv; - long refresh_age, age; + time64_t refresh_age, age; /* First decide return status as best we can */ rv = cache_is_valid(h); @@ -294,7 +301,7 @@ int cache_check(struct cache_detail *detail, rv = -ENOENT; } else if (rv == -EAGAIN || (h->expiry_time != 0 && age > refresh_age/2)) { - dprintk("RPC: Want update, refage=%ld, age=%ld\n", + dprintk("RPC: Want update, refage=%lld, age=%lld\n", refresh_age, age); if (!test_and_set_bit(CACHE_PENDING, &h->flags)) { switch (cache_make_upcall(detail, h)) { @@ -373,7 +380,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd) spin_lock(&cache_list_lock); cd->nextcheck = 0; cd->entries = 0; - atomic_set(&cd->readers, 0); + atomic_set(&cd->writers, 0); cd->last_close = 0; cd->last_warn = -1; list_add(&cd->others, &cache_list); @@ -460,8 +467,7 @@ static int cache_clean(void) if (!cache_is_expired(current_detail, ch)) continue; - hlist_del_init_rcu(&ch->cache_list); - current_detail->entries--; + sunrpc_begin_cache_remove_entry(ch, current_detail); rv = 1; break; } @@ -471,11 +477,8 @@ static int cache_clean(void) if (!ch) current_index ++; spin_unlock(&cache_list_lock); - if (ch) { - set_bit(CACHE_CLEANED, &ch->flags); - cache_fresh_unlocked(ch, d); - cache_put(ch, d); - } + if (ch) + sunrpc_end_cache_remove_entry(ch, d); } else spin_unlock(&cache_list_lock); @@ -531,13 +534,9 @@ void cache_purge(struct cache_detail *detail) for (i = 0; i < detail->hash_size; i++) { head = &detail->hash_table[i]; hlist_for_each_entry_safe(ch, tmp, head, cache_list) { - hlist_del_init_rcu(&ch->cache_list); - detail->entries--; - - set_bit(CACHE_CLEANED, &ch->flags); + sunrpc_begin_cache_remove_entry(ch, detail); spin_unlock(&detail->hash_lock); - cache_fresh_unlocked(ch, detail); - cache_put(ch, detail); + sunrpc_end_cache_remove_entry(ch, detail); spin_lock(&detail->hash_lock); } } @@ -1029,11 +1028,13 @@ static int cache_open(struct inode *inode, struct file *filp, } rp->offset = 0; rp->q.reader = 1; - atomic_inc(&cd->readers); + spin_lock(&queue_lock); list_add(&rp->q.list, &cd->queue); spin_unlock(&queue_lock); } + if (filp->f_mode & FMODE_WRITE) + atomic_inc(&cd->writers); filp->private_data = rp; return 0; } @@ -1062,8 +1063,10 @@ static int cache_release(struct inode *inode, struct file *filp, filp->private_data = NULL; kfree(rp); + } + if (filp->f_mode & FMODE_WRITE) { + atomic_dec(&cd->writers); cd->last_close = seconds_since_boot(); - atomic_dec(&cd->readers); } module_put(cd->owner); return 0; @@ -1171,7 +1174,7 @@ static void warn_no_listener(struct cache_detail *detail) static bool cache_listeners_exist(struct cache_detail *detail) { - if (atomic_read(&detail->readers)) + if (atomic_read(&detail->writers)) return true; if (detail->last_close == 0) /* This cache was never opened */ @@ -1406,7 +1409,7 @@ static int c_show(struct seq_file *m, void *p) return cd->cache_show(m, cd, NULL); ifdebug(CACHE) - seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n", + seq_printf(m, "# expiry=%lld refcnt=%d flags=%lx\n", convert_to_wallclock(cp->expiry_time), kref_read(&cp->ref), cp->flags); cache_get(cp); @@ -1479,7 +1482,7 @@ static ssize_t read_flush(struct file *file, char __user *buf, char tbuf[22]; size_t len; - len = snprintf(tbuf, sizeof(tbuf), "%lu\n", + len = snprintf(tbuf, sizeof(tbuf), "%llu\n", convert_to_wallclock(cd->flush_time)); return simple_read_from_buffer(buf, count, ppos, tbuf, len); } @@ -1490,7 +1493,7 @@ static ssize_t write_flush(struct file *file, const char __user *buf, { char tbuf[20]; char *ep; - time_t now; + time64_t now; if (*ppos || count > sizeof(tbuf)-1) return -EINVAL; @@ -1520,6 +1523,9 @@ static ssize_t write_flush(struct file *file, const char __user *buf, cd->nextcheck = now; cache_flush(); + if (cd->flush) + cd->flush(); + *ppos += count; return count; } @@ -1570,15 +1576,14 @@ static int cache_release_procfs(struct inode *inode, struct file *filp) return cache_release(inode, filp, cd); } -static const struct file_operations cache_file_operations_procfs = { - .owner = THIS_MODULE, - .llseek = no_llseek, - .read = cache_read_procfs, - .write = cache_write_procfs, - .poll = cache_poll_procfs, - .unlocked_ioctl = cache_ioctl_procfs, /* for FIONREAD */ - .open = cache_open_procfs, - .release = cache_release_procfs, +static const struct proc_ops cache_channel_proc_ops = { + .proc_lseek = no_llseek, + .proc_read = cache_read_procfs, + .proc_write = cache_write_procfs, + .proc_poll = cache_poll_procfs, + .proc_ioctl = cache_ioctl_procfs, /* for FIONREAD */ + .proc_open = cache_open_procfs, + .proc_release = cache_release_procfs, }; static int content_open_procfs(struct inode *inode, struct file *filp) @@ -1595,11 +1600,11 @@ static int content_release_procfs(struct inode *inode, struct file *filp) return content_release(inode, filp, cd); } -static const struct file_operations content_file_operations_procfs = { - .open = content_open_procfs, - .read = seq_read, - .llseek = seq_lseek, - .release = content_release_procfs, +static const struct proc_ops content_proc_ops = { + .proc_open = content_open_procfs, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = content_release_procfs, }; static int open_flush_procfs(struct inode *inode, struct file *filp) @@ -1633,12 +1638,12 @@ static ssize_t write_flush_procfs(struct file *filp, return write_flush(filp, buf, count, ppos, cd); } -static const struct file_operations cache_flush_operations_procfs = { - .open = open_flush_procfs, - .read = read_flush_procfs, - .write = write_flush_procfs, - .release = release_flush_procfs, - .llseek = no_llseek, +static const struct proc_ops cache_flush_proc_ops = { + .proc_open = open_flush_procfs, + .proc_read = read_flush_procfs, + .proc_write = write_flush_procfs, + .proc_release = release_flush_procfs, + .proc_lseek = no_llseek, }; static void remove_cache_proc_entries(struct cache_detail *cd) @@ -1661,19 +1666,19 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) goto out_nomem; p = proc_create_data("flush", S_IFREG | 0600, - cd->procfs, &cache_flush_operations_procfs, cd); + cd->procfs, &cache_flush_proc_ops, cd); if (p == NULL) goto out_nomem; if (cd->cache_request || cd->cache_parse) { p = proc_create_data("channel", S_IFREG | 0600, cd->procfs, - &cache_file_operations_procfs, cd); + &cache_channel_proc_ops, cd); if (p == NULL) goto out_nomem; } if (cd->cache_show) { p = proc_create_data("content", S_IFREG | 0400, cd->procfs, - &content_file_operations_procfs, cd); + &content_proc_ops, cd); if (p == NULL) goto out_nomem; } @@ -1885,10 +1890,9 @@ void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h) { spin_lock(&cd->hash_lock); if (!hlist_unhashed(&h->cache_list)){ - hlist_del_init_rcu(&h->cache_list); - cd->entries--; + sunrpc_begin_cache_remove_entry(h, cd); spin_unlock(&cd->hash_lock); - cache_put(h, cd); + sunrpc_end_cache_remove_entry(h, cd); } else spin_unlock(&cd->hash_lock); } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d8679b6027e9..7324b21f923e 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -591,6 +591,9 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) xprt->resvport = 1; if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT) xprt->resvport = 0; + xprt->reuseport = 0; + if (args->flags & RPC_CLNT_CREATE_REUSEPORT) + xprt->reuseport = 1; clnt = rpc_create_xprt(args, xprt); if (IS_ERR(clnt) || args->nconnect <= 1) @@ -1676,8 +1679,6 @@ call_reserveresult(struct rpc_task *task) return; } - printk(KERN_ERR "%s: status=%d, but no request slot, exiting\n", - __func__, status); rpc_call_rpcerror(task, -EIO); return; } @@ -1686,11 +1687,8 @@ call_reserveresult(struct rpc_task *task) * Even though there was an error, we may have acquired * a request slot somehow. Make sure not to leak it. */ - if (task->tk_rqstp) { - printk(KERN_ERR "%s: status=%d, request allocated anyway\n", - __func__, status); + if (task->tk_rqstp) xprt_release(task); - } switch (status) { case -ENOMEM: @@ -1699,14 +1697,9 @@ call_reserveresult(struct rpc_task *task) case -EAGAIN: /* woken up; retry */ task->tk_action = call_retry_reserve; return; - case -EIO: /* probably a shutdown */ - break; default: - printk(KERN_ERR "%s: unrecognized error %d, exiting\n", - __func__, status); - break; + rpc_call_rpcerror(task, status); } - rpc_call_rpcerror(task, status); } /* @@ -1837,7 +1830,7 @@ call_allocate(struct rpc_task *task) return; } - rpc_exit(task, -ERESTARTSYS); + rpc_call_rpcerror(task, -ERESTARTSYS); } static int @@ -1862,6 +1855,7 @@ rpc_xdr_encode(struct rpc_task *task) req->rq_rbuffer, req->rq_rcvsize); + req->rq_reply_bytes_recvd = 0; req->rq_snd_buf.head[0].iov_len = 0; xdr_init_encode(&xdr, &req->rq_snd_buf, req->rq_snd_buf.head[0].iov_base, req); @@ -1881,6 +1875,8 @@ call_encode(struct rpc_task *task) if (!rpc_task_need_encode(task)) goto out; dprint_status(task); + /* Dequeue task from the receive queue while we're encoding */ + xprt_request_dequeue_xprt(task); /* Encode here so that rpcsec_gss can use correct sequence number. */ rpc_xdr_encode(task); /* Did the encode result in an error condition? */ @@ -1970,6 +1966,7 @@ call_bind(struct rpc_task *task) static void call_bind_status(struct rpc_task *task) { + struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; int status = -EIO; if (rpc_task_transmitted(task)) { @@ -1977,14 +1974,15 @@ call_bind_status(struct rpc_task *task) return; } - if (task->tk_status >= 0) { - dprint_status(task); + dprint_status(task); + trace_rpc_bind_status(task); + if (task->tk_status >= 0) + goto out_next; + if (xprt_bound(xprt)) { task->tk_status = 0; - task->tk_action = call_connect; - return; + goto out_next; } - trace_rpc_bind_status(task); switch (task->tk_status) { case -ENOMEM: dprintk("RPC: %5u rpcbind out of memory\n", task->tk_pid); @@ -2003,6 +2001,9 @@ call_bind_status(struct rpc_task *task) task->tk_rebind_retry--; rpc_delay(task, 3*HZ); goto retry_timeout; + case -ENOBUFS: + rpc_delay(task, HZ >> 2); + goto retry_timeout; case -EAGAIN: goto retry_timeout; case -ETIMEDOUT: @@ -2026,7 +2027,6 @@ call_bind_status(struct rpc_task *task) case -ENETDOWN: case -EHOSTUNREACH: case -ENETUNREACH: - case -ENOBUFS: case -EPIPE: dprintk("RPC: %5u remote rpcbind unreachable: %d\n", task->tk_pid, task->tk_status); @@ -2043,7 +2043,9 @@ call_bind_status(struct rpc_task *task) rpc_call_rpcerror(task, status); return; - +out_next: + task->tk_action = call_connect; + return; retry_timeout: task->tk_status = 0; task->tk_action = call_bind; @@ -2090,6 +2092,7 @@ call_connect(struct rpc_task *task) static void call_connect_status(struct rpc_task *task) { + struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; struct rpc_clnt *clnt = task->tk_client; int status = task->tk_status; @@ -2099,8 +2102,17 @@ call_connect_status(struct rpc_task *task) } dprint_status(task); - trace_rpc_connect_status(task); + + if (task->tk_status == 0) { + clnt->cl_stats->netreconn++; + goto out_next; + } + if (xprt_connected(xprt)) { + task->tk_status = 0; + goto out_next; + } + task->tk_status = 0; switch (status) { case -ECONNREFUSED: @@ -2117,9 +2129,8 @@ call_connect_status(struct rpc_task *task) case -ENETDOWN: case -ENETUNREACH: case -EHOSTUNREACH: - case -EADDRINUSE: - case -ENOBUFS: case -EPIPE: + case -EPROTO: xprt_conditional_disconnect(task->tk_rqstp->rq_xprt, task->tk_rqstp->rq_connect_cookie); if (RPC_IS_SOFTCONN(task)) @@ -2127,17 +2138,20 @@ call_connect_status(struct rpc_task *task) /* retry with existing socket, after a delay */ rpc_delay(task, 3*HZ); /* fall through */ + case -EADDRINUSE: case -ENOTCONN: case -EAGAIN: case -ETIMEDOUT: goto out_retry; - case 0: - clnt->cl_stats->netreconn++; - task->tk_action = call_transmit; - return; + case -ENOBUFS: + rpc_delay(task, HZ >> 2); + goto out_retry; } rpc_call_rpcerror(task, status); return; +out_next: + task->tk_action = call_transmit; + return; out_retry: /* Check for timeouts before looping back to call_bind */ task->tk_action = call_bind; @@ -2365,7 +2379,7 @@ call_status(struct rpc_task *task) case -ECONNABORTED: case -ENOTCONN: rpc_force_rebind(clnt); - /* fall through */ + break; case -EADDRINUSE: rpc_delay(task, 3*HZ); /* fall through */ @@ -2462,6 +2476,7 @@ call_decode(struct rpc_task *task) struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; struct xdr_stream xdr; + int err; dprint_status(task); @@ -2484,6 +2499,15 @@ call_decode(struct rpc_task *task) * before it changed req->rq_reply_bytes_recvd. */ smp_rmb(); + + /* + * Did we ever call xprt_complete_rqst()? If not, we should assume + * the message is incomplete. + */ + err = -EAGAIN; + if (!req->rq_reply_bytes_recvd) + goto out; + req->rq_rcv_buf.len = req->rq_private_buf.len; /* Check that the softirq receive buffer is valid */ @@ -2492,7 +2516,9 @@ call_decode(struct rpc_task *task) xdr_init_decode(&xdr, &req->rq_rcv_buf, req->rq_rcv_buf.head[0].iov_base, req); - switch (rpc_decode_header(task, &xdr)) { + err = rpc_decode_header(task, &xdr); +out: + switch (err) { case 0: task->tk_action = rpc_exit_task; task->tk_status = rpcauth_unwrap_resp(task, &xdr); @@ -2501,9 +2527,6 @@ call_decode(struct rpc_task *task) return; case -EAGAIN: task->tk_status = 0; - xdr_free_bvec(&req->rq_rcv_buf); - req->rq_reply_bytes_recvd = 0; - req->rq_rcv_buf.len = 0; if (task->tk_client->cl_discrtry) xprt_conditional_disconnect(req->rq_xprt, req->rq_connect_cookie); @@ -2544,7 +2567,7 @@ rpc_encode_header(struct rpc_task *task, struct xdr_stream *xdr) return 0; out_fail: trace_rpc_bad_callhdr(task); - rpc_exit(task, error); + rpc_call_rpcerror(task, error); return error; } @@ -2611,7 +2634,7 @@ out_garbage: return -EAGAIN; } out_err: - rpc_exit(task, error); + rpc_call_rpcerror(task, error); return error; out_unparsable: @@ -2877,7 +2900,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt; unsigned long connect_timeout; unsigned long reconnect_timeout; - unsigned char resvport; + unsigned char resvport, reuseport; int ret = 0; rcu_read_lock(); @@ -2889,6 +2912,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, return -EAGAIN; } resvport = xprt->resvport; + reuseport = xprt->reuseport; connect_timeout = xprt->connect_timeout; reconnect_timeout = xprt->max_reconnect_timeout; rcu_read_unlock(); @@ -2899,6 +2923,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, goto out_put_switch; } xprt->resvport = resvport; + xprt->reuseport = reuseport; if (xprt->ops->set_connect_timeout != NULL) xprt->ops->set_connect_timeout(xprt, connect_timeout, diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 748bac601e47..39e14d5edaf1 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -51,7 +51,7 @@ static BLOCKING_NOTIFIER_HEAD(rpc_pipefs_notifier_list); int rpc_pipefs_notifier_register(struct notifier_block *nb) { - return blocking_notifier_chain_cond_register(&rpc_pipefs_notifier_list, nb); + return blocking_notifier_chain_register(&rpc_pipefs_notifier_list, nb); } EXPORT_SYMBOL_GPL(rpc_pipefs_notifier_register); @@ -1416,8 +1416,7 @@ EXPORT_SYMBOL_GPL(gssd_running); static int rpc_fs_get_tree(struct fs_context *fc) { - fc->s_fs_info = get_net(fc->net_ns); - return vfs_get_super(fc, vfs_get_keyed_super, rpc_fill_super); + return get_tree_keyed(fc, rpc_fill_super, get_net(fc->net_ns)); } static void rpc_fs_free_fc(struct fs_context *fc) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 1f275aba786f..55e900255b0c 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -260,7 +260,7 @@ static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const c rpc_reset_waitqueue_priority(queue); queue->qlen = 0; queue->timer_list.expires = 0; - INIT_DEFERRABLE_WORK(&queue->timer_list.dwork, __rpc_queue_timer_fn); + INIT_DELAYED_WORK(&queue->timer_list.dwork, __rpc_queue_timer_fn); INIT_LIST_HEAD(&queue->timer_list.list); rpc_assign_waitqueue_name(queue, qname); } @@ -541,33 +541,14 @@ rpc_wake_up_task_on_wq_queue_action_locked(struct workqueue_struct *wq, return NULL; } -static void -rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, - struct rpc_wait_queue *queue, struct rpc_task *task) -{ - rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, task, NULL, NULL); -} - /* * Wake up a queued task while the queue lock is being held */ -static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) +static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, + struct rpc_task *task) { - rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task); -} - -/* - * Wake up a task on a specific queue - */ -void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, - struct rpc_wait_queue *queue, - struct rpc_task *task) -{ - if (!RPC_IS_QUEUED(task)) - return; - spin_lock(&queue->lock); - rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); - spin_unlock(&queue->lock); + rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue, + task, NULL, NULL); } /* @@ -843,6 +824,7 @@ rpc_reset_task_statistics(struct rpc_task *task) */ void rpc_exit_task(struct rpc_task *task) { + trace_rpc_task_end(task, task->tk_action); task->tk_action = NULL; if (task->tk_ops->rpc_count_stats) task->tk_ops->rpc_count_stats(task, task->tk_calldata); @@ -864,6 +846,8 @@ void rpc_signal_task(struct rpc_task *task) if (!RPC_IS_ACTIVATED(task)) return; + + trace_rpc_task_signalled(task, task->tk_action); set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); smp_mb__after_atomic(); queue = READ_ONCE(task->tk_waitqueue); @@ -930,8 +914,10 @@ static void __rpc_execute(struct rpc_task *task) /* * Signalled tasks should exit rather than sleep. */ - if (RPC_SIGNALLED(task)) + if (RPC_SIGNALLED(task)) { + task->tk_rpc_status = -ERESTARTSYS; rpc_exit(task, -ERESTARTSYS); + } /* * The queue->lock protects against races with @@ -965,8 +951,9 @@ static void __rpc_execute(struct rpc_task *task) * clean up after sleeping on some queue, we don't * break the loop here, but go around once more. */ - dprintk("RPC: %5u got signal\n", task->tk_pid); + trace_rpc_task_signalled(task, task->tk_action); set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); + task->tk_rpc_status = -ERESTARTSYS; rpc_exit(task, -ERESTARTSYS); } dprintk("RPC: %5u sync task resuming\n", task->tk_pid); diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 7c74197c2ecf..c964b48eaaba 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -69,12 +69,11 @@ static int rpc_proc_open(struct inode *inode, struct file *file) return single_open(file, rpc_proc_show, PDE_DATA(inode)); } -static const struct file_operations rpc_proc_fops = { - .owner = THIS_MODULE, - .open = rpc_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +static const struct proc_ops rpc_proc_ops = { + .proc_open = rpc_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, }; /* @@ -281,19 +280,19 @@ EXPORT_SYMBOL_GPL(rpc_clnt_show_stats); */ static inline struct proc_dir_entry * do_register(struct net *net, const char *name, void *data, - const struct file_operations *fops) + const struct proc_ops *proc_ops) { struct sunrpc_net *sn; dprintk("RPC: registering /proc/net/rpc/%s\n", name); sn = net_generic(net, sunrpc_net_id); - return proc_create_data(name, 0, sn->proc_net_rpc, fops, data); + return proc_create_data(name, 0, sn->proc_net_rpc, proc_ops, data); } struct proc_dir_entry * rpc_proc_register(struct net *net, struct rpc_stat *statp) { - return do_register(net, statp->program->name, statp, &rpc_proc_fops); + return do_register(net, statp->program->name, statp, &rpc_proc_ops); } EXPORT_SYMBOL_GPL(rpc_proc_register); @@ -308,9 +307,9 @@ rpc_proc_unregister(struct net *net, const char *name) EXPORT_SYMBOL_GPL(rpc_proc_unregister); struct proc_dir_entry * -svc_proc_register(struct net *net, struct svc_stat *statp, const struct file_operations *fops) +svc_proc_register(struct net *net, struct svc_stat *statp, const struct proc_ops *proc_ops) { - return do_register(net, statp->program->pg_name, statp, fops); + return do_register(net, statp->program->pg_name, statp, proc_ops); } EXPORT_SYMBOL_GPL(svc_proc_register); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 220b79988000..187dd4e73d64 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1233,8 +1233,8 @@ svc_generic_init_request(struct svc_rqst *rqstp, if (rqstp->rq_vers >= progp->pg_nvers ) goto err_bad_vers; - versp = progp->pg_vers[rqstp->rq_vers]; - if (!versp) + versp = progp->pg_vers[rqstp->rq_vers]; + if (!versp) goto err_bad_vers; /* @@ -1337,6 +1337,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) auth_stat = rpc_autherr_badcred; auth_res = progp->pg_authenticate(rqstp); } + if (auth_res != SVC_OK) + trace_svc_authenticate(rqstp, auth_res, auth_stat); switch (auth_res) { case SVC_OK: break; diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 550b214cb001..552617e3467b 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -19,6 +19,8 @@ #include <linux/err.h> #include <linux/hash.h> +#include <trace/events/sunrpc.h> + #define RPCDBG_FACILITY RPCDBG_AUTH diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 5c04ba7d456b..04aa80a2d752 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -166,7 +166,7 @@ static void ip_map_request(struct cache_detail *cd, } static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, struct in6_addr *addr); -static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, struct unix_domain *udom, time_t expiry); +static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, struct unix_domain *udom, time64_t expiry); static int ip_map_parse(struct cache_detail *cd, char *mesg, int mlen) @@ -187,7 +187,7 @@ static int ip_map_parse(struct cache_detail *cd, struct ip_map *ipmp; struct auth_domain *dom; - time_t expiry; + time64_t expiry; if (mesg[mlen-1] != '\n') return -EINVAL; @@ -308,7 +308,7 @@ static inline struct ip_map *ip_map_lookup(struct net *net, char *class, } static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, - struct unix_domain *udom, time_t expiry) + struct unix_domain *udom, time64_t expiry) { struct ip_map ip; struct cache_head *ch; @@ -328,7 +328,7 @@ static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, } static inline int ip_map_update(struct net *net, struct ip_map *ipm, - struct unix_domain *udom, time_t expiry) + struct unix_domain *udom, time64_t expiry) { struct sunrpc_net *sn; @@ -491,7 +491,7 @@ static int unix_gid_parse(struct cache_detail *cd, int rv; int i; int err; - time_t expiry; + time64_t expiry; struct unix_gid ug, *ugp; if (mesg[mlen - 1] != '\n') diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 48c93b9e525e..e5497dc2475b 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -436,13 +436,12 @@ xdr_shrink_bufhead(struct xdr_buf *buf, size_t len) } /** - * xdr_shrink_pagelen + * xdr_shrink_pagelen - shrinks buf->pages by up to @len bytes * @buf: xdr_buf * @len: bytes to remove from buf->pages * - * Shrinks XDR buffer's page array buf->pages by - * 'len' bytes. The extra data is not lost, but is instead - * moved into the tail. + * The extra data is not lost, but is instead moved into buf->tail. + * Returns the actual number of bytes moved. */ static unsigned int xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) @@ -455,8 +454,8 @@ xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) result = 0; tail = buf->tail; - BUG_ON (len > pglen); - + if (len > buf->page_len) + len = buf-> page_len; tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len; /* Shift the tail first */ @@ -560,7 +559,7 @@ EXPORT_SYMBOL_GPL(xdr_init_encode); * required at the end of encoding, or any other time when the xdr_buf * data might be read. */ -void xdr_commit_encode(struct xdr_stream *xdr) +inline void xdr_commit_encode(struct xdr_stream *xdr) { int shift = xdr->scratch.iov_len; void *page; @@ -1080,7 +1079,7 @@ void xdr_enter_page(struct xdr_stream *xdr, unsigned int len) } EXPORT_SYMBOL_GPL(xdr_enter_page); -static struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0}; +static const struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0}; void xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf) @@ -1236,43 +1235,60 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) } EXPORT_SYMBOL_GPL(xdr_encode_word); -/* If the netobj starting offset bytes from the start of xdr_buf is contained - * entirely in the head or the tail, set object to point to it; otherwise - * try to find space for it at the end of the tail, copy it there, and - * set obj to point to it. */ -int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned int offset) +/** + * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf + * @buf: pointer to buffer containing a mic + * @mic: on success, returns the address of the mic + * @offset: the offset in buf where mic may be found + * + * This function may modify the xdr buf if the mic is found to be straddling + * a boundary between head, pages, and tail. On success the mic can be read + * from the address returned. There is no need to free the mic. + * + * Return: Success returns 0, otherwise an integer error. + */ +int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset) { struct xdr_buf subbuf; + unsigned int boundary; - if (xdr_decode_word(buf, offset, &obj->len)) + if (xdr_decode_word(buf, offset, &mic->len)) return -EFAULT; - if (xdr_buf_subsegment(buf, &subbuf, offset + 4, obj->len)) + offset += 4; + + /* Is the mic partially in the head? */ + boundary = buf->head[0].iov_len; + if (offset < boundary && (offset + mic->len) > boundary) + xdr_shift_buf(buf, boundary - offset); + + /* Is the mic partially in the pages? */ + boundary += buf->page_len; + if (offset < boundary && (offset + mic->len) > boundary) + xdr_shrink_pagelen(buf, boundary - offset); + + if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len)) return -EFAULT; - /* Is the obj contained entirely in the head? */ - obj->data = subbuf.head[0].iov_base; - if (subbuf.head[0].iov_len == obj->len) + /* Is the mic contained entirely in the head? */ + mic->data = subbuf.head[0].iov_base; + if (subbuf.head[0].iov_len == mic->len) return 0; - /* ..or is the obj contained entirely in the tail? */ - obj->data = subbuf.tail[0].iov_base; - if (subbuf.tail[0].iov_len == obj->len) + /* ..or is the mic contained entirely in the tail? */ + mic->data = subbuf.tail[0].iov_base; + if (subbuf.tail[0].iov_len == mic->len) return 0; - /* use end of tail as storage for obj: - * (We don't copy to the beginning because then we'd have - * to worry about doing a potentially overlapping copy. - * This assumes the object is at most half the length of the - * tail.) */ - if (obj->len > buf->buflen - buf->len) + /* Find a contiguous area in @buf to hold all of @mic */ + if (mic->len > buf->buflen - buf->len) return -ENOMEM; if (buf->tail[0].iov_len != 0) - obj->data = buf->tail[0].iov_base + buf->tail[0].iov_len; + mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len; else - obj->data = buf->head[0].iov_base + buf->head[0].iov_len; - __read_bytes_from_xdr_buf(&subbuf, obj->data, obj->len); + mic->data = buf->head[0].iov_base + buf->head[0].iov_len; + __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len); return 0; } -EXPORT_SYMBOL_GPL(xdr_buf_read_netobj); +EXPORT_SYMBOL_GPL(xdr_buf_read_mic); /* Returns 0 on success, or else a negative error code. */ static int diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 783748dc5e6f..1aafe8d3f3f4 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -205,20 +205,20 @@ int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) - return 1; + goto out_locked; goto out_sleep; } if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) goto out_unlock; xprt->snd_task = task; +out_locked: + trace_xprt_reserve_xprt(xprt, task); return 1; out_unlock: xprt_clear_locked(xprt); out_sleep: - dprintk("RPC: %5u failed to lock transport %p\n", - task->tk_pid, xprt); task->tk_status = -EAGAIN; if (RPC_IS_SOFT(task)) rpc_sleep_on_timeout(&xprt->sending, task, NULL, @@ -269,23 +269,22 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) - return 1; + goto out_locked; goto out_sleep; } if (req == NULL) { xprt->snd_task = task; - return 1; + goto out_locked; } if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) goto out_unlock; if (!xprt_need_congestion_window_wait(xprt)) { xprt->snd_task = task; - return 1; + goto out_locked; } out_unlock: xprt_clear_locked(xprt); out_sleep: - dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); task->tk_status = -EAGAIN; if (RPC_IS_SOFT(task)) rpc_sleep_on_timeout(&xprt->sending, task, NULL, @@ -293,6 +292,9 @@ out_sleep: else rpc_sleep_on(&xprt->sending, task, NULL); return 0; +out_locked: + trace_xprt_reserve_cong(xprt, task); + return 1; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong); @@ -357,6 +359,7 @@ void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) xprt_clear_locked(xprt); __xprt_lock_write_next(xprt); } + trace_xprt_release_xprt(xprt, task); } EXPORT_SYMBOL_GPL(xprt_release_xprt); @@ -374,6 +377,7 @@ void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) xprt_clear_locked(xprt); __xprt_lock_write_next_cong(xprt); } + trace_xprt_release_cong(xprt, task); } EXPORT_SYMBOL_GPL(xprt_release_xprt_cong); @@ -395,8 +399,7 @@ __xprt_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) { if (req->rq_cong) return 1; - dprintk("RPC: %5u xprt_cwnd_limited cong = %lu cwnd = %lu\n", - req->rq_task->tk_pid, xprt->cong, xprt->cwnd); + trace_xprt_get_cong(xprt, req->rq_task); if (RPCXPRT_CONGESTED(xprt)) { xprt_set_congestion_window_wait(xprt); return 0; @@ -418,6 +421,7 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) req->rq_cong = 0; xprt->cong -= RPC_CWNDSCALE; xprt_test_and_clear_congestion_window_wait(xprt); + trace_xprt_put_cong(xprt, req->rq_task); __xprt_lock_write_next_cong(xprt); } @@ -456,6 +460,12 @@ void xprt_release_rqst_cong(struct rpc_task *task) } EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); +static void xprt_clear_congestion_window_wait_locked(struct rpc_xprt *xprt) +{ + if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state)) + __xprt_lock_write_next_cong(xprt); +} + /* * Clear the congestion window wait flag and wake up the next * entry on xprt->sending @@ -671,6 +681,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt) spin_lock(&xprt->transport_lock); xprt_clear_connected(xprt); xprt_clear_write_space_locked(xprt); + xprt_clear_congestion_window_wait_locked(xprt); xprt_wake_pending_tasks(xprt, -ENOTCONN); spin_unlock(&xprt->transport_lock); } @@ -1324,6 +1335,36 @@ xprt_request_dequeue_transmit(struct rpc_task *task) } /** + * xprt_request_dequeue_xprt - remove a task from the transmit+receive queue + * @task: pointer to rpc_task + * + * Remove a task from the transmit and receive queues, and ensure that + * it is not pinned by the receive work item. + */ +void +xprt_request_dequeue_xprt(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) || + test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) || + xprt_is_pinned_rqst(req)) { + spin_lock(&xprt->queue_lock); + xprt_request_dequeue_transmit_locked(task); + xprt_request_dequeue_receive_locked(task); + while (xprt_is_pinned_rqst(req)) { + set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); + spin_unlock(&xprt->queue_lock); + xprt_wait_on_pinned_rqst(req); + spin_lock(&xprt->queue_lock); + clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); + } + spin_unlock(&xprt->queue_lock); + } +} + +/** * xprt_request_prepare - prepare an encoded request for transport * @req: pointer to rpc_rqst * @@ -1408,13 +1449,6 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) status = -EBADMSG; goto out_dequeue; } - if (task->tk_ops->rpc_call_prepare_transmit) { - task->tk_ops->rpc_call_prepare_transmit(task, - task->tk_calldata); - status = task->tk_status; - if (status < 0) - goto out_dequeue; - } if (RPC_SIGNALLED(task)) { status = -ERESTARTSYS; goto out_dequeue; @@ -1754,28 +1788,6 @@ void xprt_retry_reserve(struct rpc_task *task) xprt_do_reserve(xprt, task); } -static void -xprt_request_dequeue_all(struct rpc_task *task, struct rpc_rqst *req) -{ - struct rpc_xprt *xprt = req->rq_xprt; - - if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) || - test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) || - xprt_is_pinned_rqst(req)) { - spin_lock(&xprt->queue_lock); - xprt_request_dequeue_transmit_locked(task); - xprt_request_dequeue_receive_locked(task); - while (xprt_is_pinned_rqst(req)) { - set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); - spin_unlock(&xprt->queue_lock); - xprt_wait_on_pinned_rqst(req); - spin_lock(&xprt->queue_lock); - clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); - } - spin_unlock(&xprt->queue_lock); - } -} - /** * xprt_release - release an RPC request slot * @task: task which is finished with the slot @@ -1795,7 +1807,7 @@ void xprt_release(struct rpc_task *task) } xprt = req->rq_xprt; - xprt_request_dequeue_all(task, req); + xprt_request_dequeue_xprt(task); spin_lock(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); if (xprt->ops->release_request) @@ -1935,6 +1947,11 @@ static void xprt_destroy_cb(struct work_struct *work) rpc_destroy_wait_queue(&xprt->backlog); kfree(xprt->servername); /* + * Destroy any existing back channel + */ + xprt_destroy_backchannel(xprt, UINT_MAX); + + /* * Tear down transport state and free the rpc_xprt */ xprt->ops->destroy(xprt); diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 59e624b1d7a0..1a0ae0c61353 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -54,9 +54,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *xprt) { - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - - return r_xprt->rx_buf.rb_bc_srv_max_requests; + return RPCRDMA_BACKWARD_WRS >> 1; } static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) @@ -81,7 +79,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) *p = xdr_zero; if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN, - &rqst->rq_snd_buf, rpcrdma_noch)) + &rqst->rq_snd_buf, rpcrdma_noch_pullup)) return -EIO; trace_xprtrdma_cb_reply(rqst); @@ -165,6 +163,7 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) spin_lock(&xprt->bc_pa_lock); list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); spin_unlock(&xprt->bc_pa_lock); + xprt_put(xprt); } static struct rpc_rqst *rpcrdma_bc_rqst_get(struct rpcrdma_xprt *r_xprt) @@ -195,6 +194,10 @@ create_req: req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL); if (!req) return NULL; + if (rpcrdma_req_setup(r_xprt, req)) { + rpcrdma_req_destroy(req); + return NULL; + } xprt->bc_alloc_count++; rqst = &req->rl_slot; @@ -261,6 +264,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, /* Queue rqst for ULP's callback service */ bc_serv = xprt->bc_serv; + xprt_get(xprt); spin_lock(&bc_serv->sv_cb_lock); list_add(&rqst->rq_bc_list, &bc_serv->sv_cb_list); spin_unlock(&bc_serv->sv_cb_lock); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 0b6dad7580a1..125297c9aa3e 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -7,67 +7,37 @@ /* Lightweight memory registration using Fast Registration Work * Requests (FRWR). * - * FRWR features ordered asynchronous registration and deregistration - * of arbitrarily sized memory regions. This is the fastest and safest + * FRWR features ordered asynchronous registration and invalidation + * of arbitrarily-sized memory regions. This is the fastest and safest * but most complex memory registration mode. */ /* Normal operation * - * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG + * A Memory Region is prepared for RDMA Read or Write using a FAST_REG * Work Request (frwr_map). When the RDMA operation is finished, this * Memory Region is invalidated using a LOCAL_INV Work Request - * (frwr_unmap_sync). + * (frwr_unmap_async and frwr_unmap_sync). * - * Typically these Work Requests are not signaled, and neither are RDMA - * SEND Work Requests (with the exception of signaling occasionally to - * prevent provider work queue overflows). This greatly reduces HCA + * Typically FAST_REG Work Requests are not signaled, and neither are + * RDMA Send Work Requests (with the exception of signaling occasionally + * to prevent provider work queue overflows). This greatly reduces HCA * interrupt workload. - * - * As an optimization, frwr_unmap marks MRs INVALID before the - * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on - * rb_mrs immediately so that no work (like managing a linked list - * under a spinlock) is needed in the completion upcall. - * - * But this means that frwr_map() can occasionally encounter an MR - * that is INVALID but the LOCAL_INV WR has not completed. Work Queue - * ordering prevents a subsequent FAST_REG WR from executing against - * that MR while it is still being invalidated. */ /* Transport recovery * - * ->op_map and the transport connect worker cannot run at the same - * time, but ->op_unmap can fire while the transport connect worker - * is running. Thus MR recovery is handled in ->op_map, to guarantee - * that recovered MRs are owned by a sending RPC, and not one where - * ->op_unmap could fire at the same time transport reconnect is - * being done. - * - * When the underlying transport disconnects, MRs are left in one of - * four states: - * - * INVALID: The MR was not in use before the QP entered ERROR state. - * - * VALID: The MR was registered before the QP entered ERROR state. - * - * FLUSHED_FR: The MR was being registered when the QP entered ERROR - * state, and the pending WR was flushed. - * - * FLUSHED_LI: The MR was being invalidated when the QP entered ERROR - * state, and the pending WR was flushed. - * - * When frwr_map encounters FLUSHED and VALID MRs, they are recovered - * with ib_dereg_mr and then are re-initialized. Because MR recovery - * allocates fresh resources, it is deferred to a workqueue, and the - * recovered MRs are placed back on the rb_mrs list when recovery is - * complete. frwr_map allocates another MR for the current RPC while - * the broken MR is reset. - * - * To ensure that frwr_map doesn't encounter an MR that is marked - * INVALID but that is about to be flushed due to a previous transport - * disconnect, the transport connect worker attempts to drain all - * pending send queue WRs before the transport is reconnected. + * frwr_map and frwr_unmap_* cannot run at the same time the transport + * connect worker is running. The connect worker holds the transport + * send lock, just as ->send_request does. This prevents frwr_map and + * the connect worker from running concurrently. When a connection is + * closed, the Receive completion queue is drained before the allowing + * the connect worker to get control. This prevents frwr_unmap and the + * connect worker from running concurrently. + * + * When the underlying transport disconnects, MRs that are in flight + * are flushed and are likely unusable. Thus all MRs are destroyed. + * New MRs are created on demand. */ #include <linux/sunrpc/rpc_rdma.h> @@ -81,28 +51,6 @@ #endif /** - * frwr_is_supported - Check if device supports FRWR - * @device: interface adapter to check - * - * Returns true if device supports FRWR, otherwise false - */ -bool frwr_is_supported(struct ib_device *device) -{ - struct ib_device_attr *attrs = &device->attrs; - - if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) - goto out_not_supported; - if (attrs->max_fast_reg_page_list_len == 0) - goto out_not_supported; - return true; - -out_not_supported: - pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n", - device->name); - return false; -} - -/** * frwr_release_mr - Destroy one MR * @mr: MR allocated by frwr_init_mr * @@ -118,13 +66,8 @@ void frwr_release_mr(struct rpcrdma_mr *mr) kfree(mr); } -/* MRs are dynamically allocated, so simply clean up and release the MR. - * A replacement MR will subsequently be allocated on demand. - */ -static void -frwr_mr_recycle_worker(struct work_struct *work) +static void frwr_mr_recycle(struct rpcrdma_mr *mr) { - struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); struct rpcrdma_xprt *r_xprt = mr->mr_xprt; trace_xprtrdma_mr_recycle(mr); @@ -136,10 +79,10 @@ frwr_mr_recycle_worker(struct work_struct *work) mr->mr_dir = DMA_NONE; } - spin_lock(&r_xprt->rx_buf.rb_mrlock); + spin_lock(&r_xprt->rx_buf.rb_lock); list_del(&mr->mr_all); r_xprt->rx_stats.mrs_recycled++; - spin_unlock(&r_xprt->rx_buf.rb_mrlock); + spin_unlock(&r_xprt->rx_buf.rb_lock); frwr_release_mr(mr); } @@ -156,12 +99,10 @@ frwr_mr_recycle_worker(struct work_struct *work) */ void frwr_reset(struct rpcrdma_req *req) { - while (!list_empty(&req->rl_registered)) { - struct rpcrdma_mr *mr; + struct rpcrdma_mr *mr; - mr = rpcrdma_mr_pop(&req->rl_registered); - rpcrdma_mr_unmap_and_put(mr); - } + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) + rpcrdma_mr_put(mr); } /** @@ -183,14 +124,13 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) if (IS_ERR(frmr)) goto out_mr_err; - sg = kcalloc(depth, sizeof(*sg), GFP_KERNEL); + sg = kcalloc(depth, sizeof(*sg), GFP_NOFS); if (!sg) goto out_list_err; mr->frwr.fr_mr = frmr; mr->mr_dir = DMA_NONE; INIT_LIST_HEAD(&mr->mr_list); - INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker); init_completion(&mr->frwr.fr_linv_done); sg_init_table(sg, depth); @@ -203,33 +143,53 @@ out_mr_err: return rc; out_list_err: - dprintk("RPC: %s: sg allocation failure\n", - __func__); ib_dereg_mr(frmr); return -ENOMEM; } /** - * frwr_open - Prepare an endpoint for use with FRWR - * @ia: interface adapter this endpoint will use - * @ep: endpoint to prepare + * frwr_query_device - Prepare a transport for use with FRWR + * @r_xprt: controlling transport instance + * @device: RDMA device to query * * On success, sets: - * ep->rep_attr.cap.max_send_wr - * ep->rep_attr.cap.max_recv_wr + * ep->rep_attr * ep->rep_max_requests - * ia->ri_max_segs + * ia->ri_max_rdma_segs * * And these FRWR-related fields: * ia->ri_max_frwr_depth * ia->ri_mrtype * - * On failure, a negative errno is returned. + * Return values: + * On success, returns zero. + * %-EINVAL - the device does not support FRWR memory registration + * %-ENOMEM - the device is not sufficiently capable for NFS/RDMA */ -int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep) +int frwr_query_device(struct rpcrdma_xprt *r_xprt, + const struct ib_device *device) { - struct ib_device_attr *attrs = &ia->ri_id->device->attrs; + const struct ib_device_attr *attrs = &device->attrs; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; int max_qp_wr, depth, delta; + unsigned int max_sge; + + if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) || + attrs->max_fast_reg_page_list_len == 0) { + pr_err("rpcrdma: 'frwr' mode is not supported by device %s\n", + device->name); + return -EINVAL; + } + + max_sge = min_t(unsigned int, attrs->max_send_sge, + RPCRDMA_MAX_SEND_SGES); + if (max_sge < RPCRDMA_MIN_SEND_SGES) { + pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge); + return -ENOMEM; + } + ep->rep_attr.cap.max_send_sge = max_sge; + ep->rep_attr.cap.max_recv_sge = 1; ia->ri_mrtype = IB_MR_TYPE_MEM_REG; if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) @@ -239,14 +199,12 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep) * capability, but perform optimally when the MRs are not larger * than a page. */ - if (attrs->max_sge_rd > 1) + if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS) ia->ri_max_frwr_depth = attrs->max_sge_rd; else ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len; if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS) ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS; - dprintk("RPC: %s: max FR page list depth = %u\n", - __func__, ia->ri_max_frwr_depth); /* Add room for frwr register and invalidate WRs. * 1. FRWR reg WR for head @@ -270,7 +228,7 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep) } while (delta > 0); } - max_qp_wr = ia->ri_id->device->attrs.max_qp_wr; + max_qp_wr = attrs->max_qp_wr; max_qp_wr -= RPCRDMA_BACKWARD_WRS; max_qp_wr -= 1; if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) @@ -281,7 +239,7 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep) if (ep->rep_attr.cap.max_send_wr > max_qp_wr) { ep->rep_max_requests = max_qp_wr / depth; if (!ep->rep_max_requests) - return -EINVAL; + return -ENOMEM; ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth; } ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; @@ -290,30 +248,22 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep) ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ - ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / - ia->ri_max_frwr_depth); + ia->ri_max_rdma_segs = + DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth); /* Reply chunks require segments for head and tail buffers */ - ia->ri_max_segs += 2; - if (ia->ri_max_segs > RPCRDMA_MAX_HDR_SEGS) - ia->ri_max_segs = RPCRDMA_MAX_HDR_SEGS; - return 0; -} - -/** - * frwr_maxpages - Compute size of largest payload - * @r_xprt: transport - * - * Returns maximum size of an RPC message, in pages. - * - * FRWR mode conveys a list of pages per chunk segment. The - * maximum length of that list is the FRWR page list depth. - */ -size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + ia->ri_max_rdma_segs += 2; + if (ia->ri_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS) + ia->ri_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS; + + /* Ensure the underlying device is capable of conveying the + * largest r/wsize NFS will ask for. This guarantees that + * failing over from one RDMA device to another will not + * break NFS I/O. + */ + if ((ia->ri_max_rdma_segs * ia->ri_max_frwr_depth) < RPCRDMA_MAX_SEGS) + return -ENOMEM; - return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - (ia->ri_max_segs - 2) * ia->ri_max_frwr_depth); + return 0; } /** @@ -323,31 +273,25 @@ size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt) * @nsegs: number of segments remaining * @writing: true when RDMA Write will be used * @xid: XID of RPC using the registered memory - * @out: initialized MR + * @mr: MR to fill in * * Prepare a REG_MR Work Request to register a memory region * for remote access via RDMA READ or RDMA WRITE. * * Returns the next segment or a negative errno pointer. - * On success, the prepared MR is planted in @out. + * On success, @mr is filled in. */ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, __be32 xid, - struct rpcrdma_mr **out) + struct rpcrdma_mr *mr) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS; - struct rpcrdma_mr *mr; - struct ib_mr *ibmr; struct ib_reg_wr *reg_wr; - int i, n; + int i, n, dma_nents; + struct ib_mr *ibmr; u8 key; - mr = rpcrdma_mr_get(r_xprt); - if (!mr) - goto out_getmr_err; - if (nsegs > ia->ri_max_frwr_depth) nsegs = ia->ri_max_frwr_depth; for (i = 0; i < nsegs;) { @@ -362,22 +306,23 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, ++seg; ++i; - if (holes_ok) + if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS) continue; if ((i < nsegs && offset_in_page(seg->mr_offset)) || offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } mr->mr_dir = rpcrdma_data_dir(writing); + mr->mr_nents = i; - mr->mr_nents = - ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir); - if (!mr->mr_nents) + dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, mr->mr_nents, + mr->mr_dir); + if (!dma_nents) goto out_dmamap_err; ibmr = mr->frwr.fr_mr; - n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); - if (unlikely(n != mr->mr_nents)) + n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE); + if (n != dma_nents) goto out_mapmr_err; ibmr->iova &= 0x00000000ffffffff; @@ -397,22 +342,15 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, mr->mr_offset = ibmr->iova; trace_xprtrdma_mr_map(mr); - *out = mr; return seg; -out_getmr_err: - xprt_wait_for_buffer_space(&r_xprt->rx_xprt); - return ERR_PTR(-EAGAIN); - out_dmamap_err: mr->mr_dir = DMA_NONE; trace_xprtrdma_frwr_sgerr(mr, i); - rpcrdma_mr_put(mr); return ERR_PTR(-EIO); out_mapmr_err: trace_xprtrdma_frwr_maperr(mr, n); - rpcrdma_mr_recycle(mr); return ERR_PTR(-EIO); } @@ -449,7 +387,7 @@ int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) struct ib_send_wr *post_wr; struct rpcrdma_mr *mr; - post_wr = &req->rl_sendctx->sc_wr; + post_wr = &req->rl_wr; list_for_each_entry(mr, &req->rl_registered, mr_list) { struct rpcrdma_frwr *frwr; @@ -465,9 +403,6 @@ int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) post_wr = &frwr->fr_regwr.wr; } - /* If ib_post_send fails, the next ->send_request for - * @req will queue these MRs for recovery. - */ return ib_post_send(ia->ri_id->qp, post_wr, NULL); } @@ -485,7 +420,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); trace_xprtrdma_mr_remoteinv(mr); - rpcrdma_mr_unmap_and_put(mr); + rpcrdma_mr_put(mr); break; /* only one invalidated MR per RPC */ } } @@ -493,9 +428,9 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr) { if (wc->status != IB_WC_SUCCESS) - rpcrdma_mr_recycle(mr); + frwr_mr_recycle(mr); else - rpcrdma_mr_unmap_and_put(mr); + rpcrdma_mr_put(mr); } /** @@ -532,8 +467,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li_wake(wc, frwr); - complete(&frwr->fr_linv_done); __frwr_release_mr(wc, mr); + complete(&frwr->fr_linv_done); } /** @@ -562,8 +497,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ frwr = NULL; prev = &first; - while (!list_empty(&req->rl_registered)) { - mr = rpcrdma_mr_pop(&req->rl_registered); + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; @@ -596,7 +530,6 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ bad_wr = NULL; rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr); - trace_xprtrdma_post_send(req, rc); /* The final LOCAL_INV WR in the chain is supposed to * do the wake. If it was never posted, the wake will @@ -609,6 +542,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Recycle MRs in the LOCAL_INV chain that did not get posted. */ + trace_xprtrdma_post_linv(req, rc); while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); @@ -616,7 +550,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) bad_wr = bad_wr->next; list_del_init(&mr->mr_list); - rpcrdma_mr_recycle(mr); + frwr_mr_recycle(mr); } } @@ -632,11 +566,15 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, fr_cqe); struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); + struct rpcrdma_rep *rep = mr->mr_req->rl_reply; /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li_done(wc, frwr); - rpcrdma_complete_rqst(frwr->fr_req->rl_reply); __frwr_release_mr(wc, mr); + + /* Ensure @rep is generated before __frwr_release_mr */ + smp_rmb(); + rpcrdma_complete_rqst(rep); } /** @@ -662,15 +600,13 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ frwr = NULL; prev = &first; - while (!list_empty(&req->rl_registered)) { - mr = rpcrdma_mr_pop(&req->rl_registered); + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; - frwr->fr_req = req; last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; @@ -697,18 +633,18 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ bad_wr = NULL; rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr); - trace_xprtrdma_post_send(req, rc); if (!rc) return; /* Recycle MRs in the LOCAL_INV chain that did not get posted. */ + trace_xprtrdma_post_linv(req, rc); while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); mr = container_of(frwr, struct rpcrdma_mr, frwr); bad_wr = bad_wr->next; - rpcrdma_mr_recycle(mr); + frwr_mr_recycle(mr); } /* The final LOCAL_INV WR in the chain is supposed to diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 4345e6912392..28020ec104d4 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -78,8 +78,6 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) size += rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ - dprintk("RPC: %s: max call header size = %u\n", - __func__, size); return size; } @@ -100,8 +98,6 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ - dprintk("RPC: %s: max reply header size = %u\n", - __func__, size); return size; } @@ -115,7 +111,7 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) */ void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) { - unsigned int maxsegs = r_xprt->rx_ia.ri_max_segs; + unsigned int maxsegs = r_xprt->rx_ia.ri_max_rdma_segs; struct rpcrdma_ep *ep = &r_xprt->rx_ep; ep->rep_max_inline_send = @@ -149,7 +145,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, remaining -= min_t(unsigned int, PAGE_SIZE - offset, remaining); offset = 0; - if (++count > r_xprt->rx_ia.ri_max_send_sges) + if (++count > r_xprt->rx_ep.rep_attr.cap.max_send_sge) return false; } } @@ -342,6 +338,31 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, return 0; } +static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpcrdma_mr_seg *seg, + int nsegs, bool writing, + struct rpcrdma_mr **mr) +{ + *mr = rpcrdma_mr_pop(&req->rl_free_mrs); + if (!*mr) { + *mr = rpcrdma_mr_get(r_xprt); + if (!*mr) + goto out_getmr_err; + trace_xprtrdma_mr_get(req); + (*mr)->mr_req = req; + } + + rpcrdma_mr_push(*mr, &req->rl_registered); + return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); + +out_getmr_err: + trace_xprtrdma_nomrs(req); + xprt_wait_for_buffer_space(&r_xprt->rx_xprt); + rpcrdma_mrs_refresh(r_xprt); + return ERR_PTR(-EAGAIN); +} + /* Register and XDR encode the Read list. Supports encoding a list of read * segments that belong to a single read chunk. * @@ -356,9 +377,10 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, * * Only a single @pos value is currently supported. */ -static noinline int -rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype) +static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpc_rqst *rqst, + enum rpcrdma_chunktype rtype) { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; @@ -366,7 +388,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, unsigned int pos; int nsegs; - if (rtype == rpcrdma_noch) + if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped) goto done; pos = rqst->rq_snd_buf.head[0].iov_len; @@ -379,10 +401,9 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, return nsegs; do { - seg = frwr_map(r_xprt, seg, nsegs, false, rqst->rq_xid, &mr); + seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_mr_push(mr, &req->rl_registered); if (encode_read_segment(xdr, mr, pos) < 0) return -EMSGSIZE; @@ -411,9 +432,10 @@ done: * * Only a single Write chunk is currently supported. */ -static noinline int -rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) +static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpc_rqst *rqst, + enum rpcrdma_chunktype wtype) { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; @@ -440,10 +462,9 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { - seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr); + seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; @@ -474,9 +495,10 @@ done: * Returns zero on success, or a negative errno if a failure occurred. * @xdr is advanced to the next position in the stream. */ -static noinline int -rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) +static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpc_rqst *rqst, + enum rpcrdma_chunktype wtype) { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; @@ -501,10 +523,9 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { - seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr); + seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; @@ -539,6 +560,7 @@ static void rpcrdma_sendctx_done(struct kref *kref) */ void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc) { + struct rpcrdma_regbuf *rb = sc->sc_req->rl_sendbuf; struct ib_sge *sge; if (!sc->sc_unmap_count) @@ -550,7 +572,7 @@ void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc) */ for (sge = &sc->sc_sges[2]; sc->sc_unmap_count; ++sge, --sc->sc_unmap_count) - ib_dma_unmap_page(sc->sc_device, sge->addr, sge->length, + ib_dma_unmap_page(rdmab_device(rb), sge->addr, sge->length, DMA_TO_DEVICE); kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done); @@ -558,152 +580,228 @@ void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc) /* Prepare an SGE for the RPC-over-RDMA transport header. */ -static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt, +static void rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, u32 len) { struct rpcrdma_sendctx *sc = req->rl_sendctx; struct rpcrdma_regbuf *rb = req->rl_rdmabuf; - struct ib_sge *sge = sc->sc_sges; + struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; + + sge->addr = rdmab_addr(rb); + sge->length = len; + sge->lkey = rdmab_lkey(rb); + + ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, + DMA_TO_DEVICE); +} + +/* The head iovec is straightforward, as it is usually already + * DMA-mapped. Sync the content that has changed. + */ +static bool rpcrdma_prepare_head_iov(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, unsigned int len) +{ + struct rpcrdma_sendctx *sc = req->rl_sendctx; + struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; + struct rpcrdma_regbuf *rb = req->rl_sendbuf; if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) - goto out_regbuf; + return false; + sge->addr = rdmab_addr(rb); sge->length = len; sge->lkey = rdmab_lkey(rb); ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, DMA_TO_DEVICE); - sc->sc_wr.num_sge++; return true; +} -out_regbuf: - pr_err("rpcrdma: failed to DMA map a Send buffer\n"); +/* If there is a page list present, DMA map and prepare an + * SGE for each page to be sent. + */ +static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + struct rpcrdma_sendctx *sc = req->rl_sendctx; + struct rpcrdma_regbuf *rb = req->rl_sendbuf; + unsigned int page_base, len, remaining; + struct page **ppages; + struct ib_sge *sge; + + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + page_base = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + sge = &sc->sc_sges[req->rl_wr.num_sge++]; + len = min_t(unsigned int, PAGE_SIZE - page_base, remaining); + sge->addr = ib_dma_map_page(rdmab_device(rb), *ppages, + page_base, len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdmab_device(rb), sge->addr)) + goto out_mapping_err; + + sge->length = len; + sge->lkey = rdmab_lkey(rb); + + sc->sc_unmap_count++; + ppages++; + remaining -= len; + page_base = 0; + } + + return true; + +out_mapping_err: + trace_xprtrdma_dma_maperr(sge->addr); return false; } -/* Prepare the Send SGEs. The head and tail iovec, and each entry - * in the page list, gets its own SGE. +/* The tail iovec may include an XDR pad for the page list, + * as well as additional content, and may not reside in the + * same page as the head iovec. */ -static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_req *req, +static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req, struct xdr_buf *xdr, - enum rpcrdma_chunktype rtype) + unsigned int page_base, unsigned int len) { struct rpcrdma_sendctx *sc = req->rl_sendctx; - unsigned int sge_no, page_base, len, remaining; + struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; struct rpcrdma_regbuf *rb = req->rl_sendbuf; - struct ib_sge *sge = sc->sc_sges; - struct page *page, **ppages; + struct page *page = virt_to_page(xdr->tail[0].iov_base); - /* The head iovec is straightforward, as it is already - * DMA-mapped. Sync the content that has changed. - */ - if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) - goto out_regbuf; - sc->sc_device = rdmab_device(rb); - sge_no = 1; - sge[sge_no].addr = rdmab_addr(rb); - sge[sge_no].length = xdr->head[0].iov_len; - sge[sge_no].lkey = rdmab_lkey(rb); - ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr, - sge[sge_no].length, DMA_TO_DEVICE); + sge->addr = ib_dma_map_page(rdmab_device(rb), page, page_base, len, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdmab_device(rb), sge->addr)) + goto out_mapping_err; - /* If there is a Read chunk, the page list is being handled - * via explicit RDMA, and thus is skipped here. However, the - * tail iovec may include an XDR pad for the page list, as - * well as additional content, and may not reside in the - * same page as the head iovec. - */ - if (rtype == rpcrdma_readch) { - len = xdr->tail[0].iov_len; + sge->length = len; + sge->lkey = rdmab_lkey(rb); + ++sc->sc_unmap_count; + return true; - /* Do not include the tail if it is only an XDR pad */ - if (len < 4) - goto out; +out_mapping_err: + trace_xprtrdma_dma_maperr(sge->addr); + return false; +} - page = virt_to_page(xdr->tail[0].iov_base); - page_base = offset_in_page(xdr->tail[0].iov_base); +/* Copy the tail to the end of the head buffer. + */ +static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + unsigned char *dst; - /* If the content in the page list is an odd length, - * xdr_write_pages() has added a pad at the beginning - * of the tail iovec. Force the tail's non-pad content - * to land at the next XDR position in the Send message. - */ - page_base += len & 3; - len -= len & 3; - goto map_tail; - } + dst = (unsigned char *)xdr->head[0].iov_base; + dst += xdr->head[0].iov_len + xdr->page_len; + memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len); + r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len; +} - /* If there is a page list present, temporarily DMA map - * and prepare an SGE for each page to be sent. - */ - if (xdr->page_len) { - ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); - page_base = offset_in_page(xdr->page_base); - remaining = xdr->page_len; - while (remaining) { - sge_no++; - if (sge_no > RPCRDMA_MAX_SEND_SGES - 2) - goto out_mapping_overflow; - - len = min_t(u32, PAGE_SIZE - page_base, remaining); - sge[sge_no].addr = - ib_dma_map_page(rdmab_device(rb), *ppages, - page_base, len, DMA_TO_DEVICE); - if (ib_dma_mapping_error(rdmab_device(rb), - sge[sge_no].addr)) - goto out_mapping_err; - sge[sge_no].length = len; - sge[sge_no].lkey = rdmab_lkey(rb); - - sc->sc_unmap_count++; - ppages++; - remaining -= len; - page_base = 0; - } +/* Copy pagelist content into the head buffer. + */ +static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + unsigned int len, page_base, remaining; + struct page **ppages; + unsigned char *src, *dst; + + dst = (unsigned char *)xdr->head[0].iov_base; + dst += xdr->head[0].iov_len; + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + page_base = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + src = page_address(*ppages); + src += page_base; + len = min_t(unsigned int, PAGE_SIZE - page_base, remaining); + memcpy(dst, src, len); + r_xprt->rx_stats.pullup_copy_count += len; + + ppages++; + dst += len; + remaining -= len; + page_base = 0; } +} - /* The tail iovec is not always constructed in the same - * page where the head iovec resides (see, for example, - * gss_wrap_req_priv). To neatly accommodate that case, - * DMA map it separately. - */ - if (xdr->tail[0].iov_len) { - page = virt_to_page(xdr->tail[0].iov_base); - page_base = offset_in_page(xdr->tail[0].iov_base); - len = xdr->tail[0].iov_len; +/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it. + * When the head, pagelist, and tail are small, a pull-up copy + * is considerably less costly than DMA mapping the components + * of @xdr. + * + * Assumptions: + * - the caller has already verified that the total length + * of the RPC Call body will fit into @rl_sendbuf. + */ +static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + if (unlikely(xdr->tail[0].iov_len)) + rpcrdma_pullup_tail_iov(r_xprt, req, xdr); -map_tail: - sge_no++; - sge[sge_no].addr = - ib_dma_map_page(rdmab_device(rb), page, page_base, len, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(rdmab_device(rb), sge[sge_no].addr)) - goto out_mapping_err; - sge[sge_no].length = len; - sge[sge_no].lkey = rdmab_lkey(rb); - sc->sc_unmap_count++; - } + if (unlikely(xdr->page_len)) + rpcrdma_pullup_pagelist(r_xprt, req, xdr); -out: - sc->sc_wr.num_sge += sge_no; - if (sc->sc_unmap_count) + /* The whole RPC message resides in the head iovec now */ + return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len); +} + +static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + struct kvec *tail = &xdr->tail[0]; + + if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len)) + return false; + if (xdr->page_len) + if (!rpcrdma_prepare_pagelist(req, xdr)) + return false; + if (tail->iov_len) + if (!rpcrdma_prepare_tail_iov(req, xdr, + offset_in_page(tail->iov_base), + tail->iov_len)) + return false; + + if (req->rl_sendctx->sc_unmap_count) kref_get(&req->rl_kref); return true; +} -out_regbuf: - pr_err("rpcrdma: failed to DMA map a Send buffer\n"); - return false; +static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len)) + return false; -out_mapping_overflow: - rpcrdma_sendctx_unmap(sc); - pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no); - return false; + /* If there is a Read chunk, the page list is being handled + * via explicit RDMA, and thus is skipped here. + */ -out_mapping_err: - rpcrdma_sendctx_unmap(sc); - trace_xprtrdma_dma_maperr(sge[sge_no].addr); - return false; + /* Do not include the tail if it is only an XDR pad */ + if (xdr->tail[0].iov_len > 3) { + unsigned int page_base, len; + + /* If the content in the page list is an odd length, + * xdr_write_pages() adds a pad at the beginning of + * the tail iovec. Force the tail's non-pad content to + * land at the next XDR position in the Send message. + */ + page_base = offset_in_page(xdr->tail[0].iov_base); + len = xdr->tail[0].iov_len; + page_base += len & 3; + len -= len & 3; + if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len)) + return false; + kref_get(&req->rl_kref); + } + + return true; } /** @@ -716,31 +814,52 @@ out_mapping_err: * * Returns 0 on success; otherwise a negative errno is returned. */ -int -rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_req *req, u32 hdrlen, - struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) +inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, u32 hdrlen, + struct xdr_buf *xdr, + enum rpcrdma_chunktype rtype) { int ret; ret = -EAGAIN; req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt); if (!req->rl_sendctx) - goto err; - req->rl_sendctx->sc_wr.num_sge = 0; + goto out_nosc; req->rl_sendctx->sc_unmap_count = 0; req->rl_sendctx->sc_req = req; kref_init(&req->rl_kref); + req->rl_wr.wr_cqe = &req->rl_sendctx->sc_cqe; + req->rl_wr.sg_list = req->rl_sendctx->sc_sges; + req->rl_wr.num_sge = 0; + req->rl_wr.opcode = IB_WR_SEND; + + rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen); ret = -EIO; - if (!rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen)) - goto err; - if (rtype != rpcrdma_areadch) - if (!rpcrdma_prepare_msg_sges(r_xprt, req, xdr, rtype)) - goto err; + switch (rtype) { + case rpcrdma_noch_pullup: + if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr)) + goto out_unmap; + break; + case rpcrdma_noch_mapped: + if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr)) + goto out_unmap; + break; + case rpcrdma_readch: + if (!rpcrdma_prepare_readch(r_xprt, req, xdr)) + goto out_unmap; + break; + case rpcrdma_areadch: + break; + default: + goto out_unmap; + } + return 0; -err: +out_unmap: + rpcrdma_sendctx_unmap(req->rl_sendctx); +out_nosc: trace_xprtrdma_prepsend_failed(&req->rl_slot, ret); return ret; } @@ -770,6 +889,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct xdr_stream *xdr = &req->rl_stream; enum rpcrdma_chunktype rtype, wtype; + struct xdr_buf *buf = &rqst->rq_snd_buf; bool ddp_allowed; __be32 *p; int ret; @@ -785,7 +905,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) goto out_err; *p++ = rqst->rq_xid; *p++ = rpcrdma_version; - *p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); + *p++ = r_xprt->rx_buf.rb_max_requests; /* When the ULP employs a GSS flavor that guarantees integrity * or privacy, direct data placement of individual data items @@ -827,8 +947,9 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) */ if (rpcrdma_args_inline(r_xprt, rqst)) { *p++ = rdma_msg; - rtype = rpcrdma_noch; - } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { + rtype = buf->len < rdmab_length(req->rl_sendbuf) ? + rpcrdma_noch_pullup : rpcrdma_noch_mapped; + } else if (ddp_allowed && buf->flags & XDRBUF_WRITE) { *p++ = rdma_msg; rtype = rpcrdma_readch; } else { @@ -837,17 +958,6 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) rtype = rpcrdma_areadch; } - /* If this is a retransmit, discard previously registered - * chunks. Very likely the connection has been replaced, - * so these registrations are invalid and unusable. - */ - while (unlikely(!list_empty(&req->rl_registered))) { - struct rpcrdma_mr *mr; - - mr = rpcrdma_mr_pop(&req->rl_registered); - rpcrdma_mr_recycle(mr); - } - /* This implementation supports the following combinations * of chunk lists in one RPC-over-RDMA Call message: * @@ -881,7 +991,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) goto out_err; ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len, - &rqst->rq_snd_buf, rtype); + buf, rtype); if (ret) goto out_err; @@ -895,6 +1005,40 @@ out_err: return ret; } +static void __rpcrdma_update_cwnd_locked(struct rpc_xprt *xprt, + struct rpcrdma_buffer *buf, + u32 grant) +{ + buf->rb_credits = grant; + xprt->cwnd = grant << RPC_CWNDSHIFT; +} + +static void rpcrdma_update_cwnd(struct rpcrdma_xprt *r_xprt, u32 grant) +{ + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + + spin_lock(&xprt->transport_lock); + __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, grant); + spin_unlock(&xprt->transport_lock); +} + +/** + * rpcrdma_reset_cwnd - Reset the xprt's congestion window + * @r_xprt: controlling transport instance + * + * Prepare @r_xprt for the next connection by reinitializing + * its credit grant to one (see RFC 8166, Section 3.3.3). + */ +void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt) +{ + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + + spin_lock(&xprt->transport_lock); + xprt->cong = 0; + __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, 1); + spin_unlock(&xprt->transport_lock); +} + /** * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs * @rqst: controlling RPC request @@ -934,7 +1078,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) curlen = rqst->rq_rcv_buf.head[0].iov_len; if (curlen > copy_len) curlen = copy_len; - trace_xprtrdma_fixup(rqst, copy_len, curlen); srcp += curlen; copy_len -= curlen; @@ -954,8 +1097,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) if (curlen > pagelist_len) curlen = pagelist_len; - trace_xprtrdma_fixup_pg(rqst, i, srcp, - copy_len, curlen); destp = kmap_atomic(ppages[i]); memcpy(destp + page_base, srcp, curlen); flush_dcache_page(ppages[i]); @@ -987,6 +1128,8 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) rqst->rq_private_buf.tail[0].iov_base = srcp; } + if (fixup_copy_count) + trace_xprtrdma_fixup(rqst, fixup_copy_count); return fixup_copy_count; } @@ -1240,8 +1383,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) struct rpc_rqst *rqst = rep->rr_rqst; int status; - xprt->reestablish_timeout = 0; - switch (rep->rr_proc) { case rdma_msg: status = rpcrdma_decode_msg(r_xprt, rep, rqst); @@ -1300,6 +1441,12 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) u32 credits; __be32 *p; + /* Any data means we had a useful conversation, so + * then we don't need to delay the next reconnect. + */ + if (xprt->reestablish_timeout) + xprt->reestablish_timeout = 0; + /* Fixed transport header fields */ xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, rep->rr_hdrbuf.head[0].iov_base, NULL); @@ -1329,14 +1476,11 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (credits == 0) credits = 1; /* don't deadlock */ - else if (credits > buf->rb_max_requests) - credits = buf->rb_max_requests; - if (buf->rb_credits != credits) { - spin_lock(&xprt->transport_lock); - buf->rb_credits = credits; - xprt->cwnd = credits << RPC_CWNDSHIFT; - spin_unlock(&xprt->transport_lock); - } + else if (credits > r_xprt->rx_ep.rep_max_requests) + credits = r_xprt->rx_ep.rep_max_requests; + if (buf->rb_credits != credits) + rpcrdma_update_cwnd(r_xprt, credits); + rpcrdma_post_recvs(r_xprt, false); req = rpcr_to_rdmar(rqst); if (req->rl_reply) { diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index abdb3004a1e3..97bca509a391 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -73,8 +73,6 @@ atomic_t rdma_stat_rq_prod; atomic_t rdma_stat_sq_poll; atomic_t rdma_stat_sq_prod; -struct workqueue_struct *svc_rdma_wq; - /* * This function implements reading and resetting an atomic_t stat * variable through read/write to a proc file. Any write to the file @@ -230,7 +228,6 @@ static struct ctl_table svcrdma_root_table[] = { void svc_rdma_cleanup(void) { dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); - destroy_workqueue(svc_rdma_wq); if (svcrdma_table_header) { unregister_sysctl_table(svcrdma_table_header); svcrdma_table_header = NULL; @@ -246,10 +243,6 @@ int svc_rdma_init(void) dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); - svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0); - if (!svc_rdma_wq) - return -ENOMEM; - if (!svcrdma_table_header) svcrdma_table_header = register_sysctl_table(svcrdma_root_table); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index d1fcc41d5eb5..908e78bb87c6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -195,6 +195,7 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); #endif + rqst->rq_xtime = ktime_get(); rc = svc_rdma_bc_sendto(rdma, rqst, ctxt); if (rc) { svc_rdma_send_ctxt_put(rdma, ctxt); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 65e2fb9aac65..96bccd398469 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -172,9 +172,10 @@ static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma, void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma) { struct svc_rdma_recv_ctxt *ctxt; + struct llist_node *node; - while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts))) { - list_del(&ctxt->rc_list); + while ((node = llist_del_first(&rdma->sc_recv_ctxts))) { + ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node); svc_rdma_recv_ctxt_destroy(rdma, ctxt); } } @@ -183,21 +184,18 @@ static struct svc_rdma_recv_ctxt * svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) { struct svc_rdma_recv_ctxt *ctxt; + struct llist_node *node; - spin_lock(&rdma->sc_recv_lock); - ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts); - if (!ctxt) + node = llist_del_first(&rdma->sc_recv_ctxts); + if (!node) goto out_empty; - list_del(&ctxt->rc_list); - spin_unlock(&rdma->sc_recv_lock); + ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node); out: ctxt->rc_page_count = 0; return ctxt; out_empty: - spin_unlock(&rdma->sc_recv_lock); - ctxt = svc_rdma_recv_ctxt_alloc(rdma); if (!ctxt) return NULL; @@ -218,11 +216,9 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, for (i = 0; i < ctxt->rc_page_count; i++) put_page(ctxt->rc_pages[i]); - if (!ctxt->rc_temp) { - spin_lock(&rdma->sc_recv_lock); - list_add(&ctxt->rc_list, &rdma->sc_recv_ctxts); - spin_unlock(&rdma->sc_recv_lock); - } else + if (!ctxt->rc_temp) + llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts); + else svc_rdma_recv_ctxt_destroy(rdma, ctxt); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 6fdba72f89f4..f3f108090aa4 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -233,11 +233,15 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, /* The first SGE contains the transport header, which * remains mapped until @ctxt is destroyed. */ - for (i = 1; i < ctxt->sc_send_wr.num_sge; i++) + for (i = 1; i < ctxt->sc_send_wr.num_sge; i++) { ib_dma_unmap_page(device, ctxt->sc_sges[i].addr, ctxt->sc_sges[i].length, DMA_TO_DEVICE); + trace_svcrdma_dma_unmap_page(rdma, + ctxt->sc_sges[i].addr, + ctxt->sc_sges[i].length); + } for (i = 0; i < ctxt->sc_page_count; ++i) put_page(ctxt->sc_pages[i]); @@ -490,6 +494,7 @@ static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, dma_addr_t dma_addr; dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE); + trace_svcrdma_dma_map_page(rdma, dma_addr, len); if (ib_dma_mapping_error(dev, dma_addr)) goto out_maperr; @@ -499,7 +504,6 @@ static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, return 0; out_maperr: - trace_svcrdma_dma_map_page(rdma, page); return -EIO; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 3fe665152d95..145a3615c319 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -140,14 +140,13 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts); - INIT_LIST_HEAD(&cma_xprt->sc_recv_ctxts); + init_llist_head(&cma_xprt->sc_recv_ctxts); INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); spin_lock_init(&cma_xprt->sc_send_lock); - spin_lock_init(&cma_xprt->sc_recv_lock); spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); /* @@ -454,14 +453,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: error creating PD for connect request\n"); goto errout; } - newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth, - 0, IB_POLL_WORKQUEUE); + newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, newxprt->sc_sq_depth, + IB_POLL_WORKQUEUE); if (IS_ERR(newxprt->sc_sq_cq)) { dprintk("svcrdma: error creating SQ CQ for connect request\n"); goto errout; } - newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, rq_depth, - 0, IB_POLL_WORKQUEUE); + newxprt->sc_rq_cq = + ib_alloc_cq_any(dev, newxprt, rq_depth, IB_POLL_WORKQUEUE); if (IS_ERR(newxprt->sc_rq_cq)) { dprintk("svcrdma: error creating RQ CQ for connect request\n"); goto errout; @@ -630,8 +629,9 @@ static void svc_rdma_free(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); + INIT_WORK(&rdma->sc_work, __svc_rdma_free); - queue_work(svc_rdma_wq, &rdma->sc_work); + schedule_work(&rdma->sc_work); } static int svc_rdma_has_wspace(struct svc_xprt *xprt) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 2ec349ed4770..3cfeba68ee9a 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -243,16 +243,13 @@ xprt_rdma_connect_worker(struct work_struct *work) rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); xprt_clear_connecting(xprt); if (r_xprt->rx_ep.rep_connected > 0) { - if (!xprt_test_and_set_connected(xprt)) { - xprt->stat.connect_count++; - xprt->stat.connect_time += (long)jiffies - - xprt->stat.connect_start; - xprt_wake_pending_tasks(xprt, -EAGAIN); - } - } else { - if (xprt_test_and_clear_connected(xprt)) - xprt_wake_pending_tasks(xprt, rc); + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; + xprt_set_connected(xprt); + rc = -EAGAIN; } + xprt_wake_pending_tasks(xprt, rc); } /** @@ -319,7 +316,8 @@ xprt_setup_rdma(struct xprt_create *args) if (args->addrlen > sizeof(xprt->addr)) return ERR_PTR(-EBADF); - xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0, 0); + xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0, + xprt_rdma_slot_table_entries); if (!xprt) return ERR_PTR(-ENOMEM); @@ -361,19 +359,13 @@ xprt_setup_rdma(struct xprt_create *args) if (rc) goto out3; - INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, - xprt_rdma_connect_worker); - - xprt->max_payload = frwr_maxpages(new_xprt); - if (xprt->max_payload == 0) - goto out4; - xprt->max_payload <<= PAGE_SHIFT; - dprintk("RPC: %s: transport data payload maximum: %zu bytes\n", - __func__, xprt->max_payload); - if (!try_module_get(THIS_MODULE)) goto out4; + INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, + xprt_rdma_connect_worker); + xprt->max_payload = RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT; + dprintk("RPC: %s: %s:%s\n", __func__, xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PORT]); @@ -423,17 +415,10 @@ void xprt_rdma_close(struct rpc_xprt *xprt) if (ep->rep_connected == -ENODEV) return; - if (ep->rep_connected > 0) - xprt->reestablish_timeout = 0; rpcrdma_ep_disconnect(ep, ia); - /* Prepare @xprt for the next connection by reinitializing - * its credit grant to one (see RFC 8166, Section 3.3.3). - */ - r_xprt->rx_buf.rb_credits = 1; - xprt->cwnd = RPC_CWNDSHIFT; - out: + xprt->reestablish_timeout = 0; ++xprt->connect_cookie; xprt_disconnect_done(xprt); } @@ -451,12 +436,6 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) struct sockaddr *sap = (struct sockaddr *)&xprt->addr; char buf[8]; - dprintk("RPC: %s: setting port for xprt %p (%s:%s) to %u\n", - __func__, xprt, - xprt->address_strings[RPC_DISPLAY_ADDR], - xprt->address_strings[RPC_DISPLAY_PORT], - port); - rpc_set_port(sap, port); kfree(xprt->address_strings[RPC_DISPLAY_PORT]); @@ -466,6 +445,9 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]); snprintf(buf, sizeof(buf), "%4hx", port); xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); + + trace_xprtrdma_op_setport(container_of(xprt, struct rpcrdma_xprt, + rx_xprt)); } /** @@ -494,9 +476,9 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) * @reconnect_timeout: reconnect timeout after server disconnects * */ -static void xprt_rdma_tcp_set_connect_timeout(struct rpc_xprt *xprt, - unsigned long connect_timeout, - unsigned long reconnect_timeout) +static void xprt_rdma_set_connect_timeout(struct rpc_xprt *xprt, + unsigned long connect_timeout, + unsigned long reconnect_timeout) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); @@ -537,13 +519,12 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); unsigned long delay; - trace_xprtrdma_op_connect(r_xprt); - delay = 0; if (r_xprt->rx_ep.rep_connected != 0) { delay = xprt_reconnect_delay(xprt); xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO); } + trace_xprtrdma_op_connect(r_xprt, delay); queue_delayed_work(xprtiod_workqueue, &r_xprt->rx_connect_worker, delay); } @@ -571,6 +552,7 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) return; out_sleep: + set_bit(XPRT_CONGESTED, &xprt->state); rpc_sleep_on(&xprt->backlog, task, NULL); task->tk_status = -EAGAIN; } @@ -589,7 +571,8 @@ xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst) memset(rqst, 0, sizeof(*rqst)); rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst)); - rpc_wake_up_next(&xprt->backlog); + if (unlikely(!rpc_wake_up_next(&xprt->backlog))) + clear_bit(XPRT_CONGESTED, &xprt->state); } static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt, @@ -803,7 +786,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = { .send_request = xprt_rdma_send_request, .close = xprt_rdma_close, .destroy = xprt_rdma_destroy, - .set_connect_timeout = xprt_rdma_tcp_set_connect_timeout, + .set_connect_timeout = xprt_rdma_set_connect_timeout, .print_stats = xprt_rdma_print_stats, .enable_swap = xprt_rdma_enable_swap, .disable_swap = xprt_rdma_disable_swap, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 805b1f35e1ca..353f61ac8d51 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -53,6 +53,7 @@ #include <linux/slab.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/svc_rdma.h> +#include <linux/log2.h> #include <asm-generic/barrier.h> #include <asm/bitops.h> @@ -73,15 +74,21 @@ /* * internal functions */ -static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); +static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt); +static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt); +static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_sendctx *sc); +static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt); +static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt); +static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep); +static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); -static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); +static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt); static struct rpcrdma_regbuf * rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, gfp_t flags); static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb); static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); -static void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); /* Wait for outstanding transport work to finish. ib_drain_qp * handles the drains in the wrong order for us, so open code @@ -122,7 +129,7 @@ rpcrdma_qp_event_handler(struct ib_event *event, void *context) /** * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC - * @cq: completion queue (ignored) + * @cq: completion queue * @wc: completed WR * */ @@ -135,7 +142,7 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_send(sc, wc); - rpcrdma_sendctx_put_locked(sc); + rpcrdma_sendctx_put_locked((struct rpcrdma_xprt *)cq->cq_context, sc); } /** @@ -167,19 +174,18 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) rdmab_addr(rep->rr_rdmabuf), wc->byte_len, DMA_FROM_DEVICE); - rpcrdma_post_recvs(r_xprt, false); rpcrdma_reply_handler(rep); return; out_flushed: - rpcrdma_recv_buffer_put(rep); + rpcrdma_rep_destroy(rep); } -static void -rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, - struct rdma_conn_param *param) +static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt, + struct rdma_conn_param *param) { const struct rpcrdma_connect_private *pmsg = param->private_data; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; unsigned int rsize, wsize; /* Default settings for RPC-over-RDMA Version One */ @@ -195,13 +201,11 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); } - if (rsize < r_xprt->rx_ep.rep_inline_recv) - r_xprt->rx_ep.rep_inline_recv = rsize; - if (wsize < r_xprt->rx_ep.rep_inline_send) - r_xprt->rx_ep.rep_inline_send = wsize; - dprintk("RPC: %s: max send %u, max recv %u\n", __func__, - r_xprt->rx_ep.rep_inline_send, - r_xprt->rx_ep.rep_inline_recv); + if (rsize < ep->rep_inline_recv) + ep->rep_inline_recv = rsize; + if (wsize < ep->rep_inline_send) + ep->rep_inline_send = wsize; + rpcrdma_set_max_header_sizes(r_xprt); } @@ -244,6 +248,7 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) ia->ri_id->device->name, rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); #endif + init_completion(&ia->ri_remove_done); set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); ep->rep_connected = -ENODEV; xprt_force_disconnect(xprt); @@ -255,7 +260,8 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_ESTABLISHED: ++xprt->connect_cookie; ep->rep_connected = 1; - rpcrdma_update_connect_private(r_xprt, &event->param.conn); + rpcrdma_update_cm_private(r_xprt, &event->param.conn); + trace_xprtrdma_inline_thresh(r_xprt); wake_up_all(&ep->rep_connect_wait); break; case RDMA_CM_EVENT_CONNECT_ERROR: @@ -295,10 +301,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) struct rdma_cm_id *id; int rc; - trace_xprtrdma_conn_start(xprt); - init_completion(&ia->ri_done); - init_completion(&ia->ri_remove_done); id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler, xprt, RDMA_PS_TCP, IB_QPT_RC); @@ -312,10 +315,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) if (rc) goto out; rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); - if (rc < 0) { - trace_xprtrdma_conn_tout(xprt); + if (rc < 0) goto out; - } rc = ia->ri_async_rc; if (rc) @@ -326,10 +327,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) if (rc) goto out; rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); - if (rc < 0) { - trace_xprtrdma_conn_tout(xprt); + if (rc < 0) goto out; - } rc = ia->ri_async_rc; if (rc) goto out; @@ -371,18 +370,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt) goto out_err; } - switch (xprt_rdma_memreg_strategy) { - case RPCRDMA_FRWR: - if (frwr_is_supported(ia->ri_id->device)) - break; - /*FALLTHROUGH*/ - default: - pr_err("rpcrdma: Device %s does not support memreg mode %d\n", - ia->ri_id->device->name, xprt_rdma_memreg_strategy); - rc = -EINVAL; - goto out_err; - } - return 0; out_err: @@ -396,6 +383,8 @@ out_err: * * Divest transport H/W resources associated with this adapter, * but allow it to be restored later. + * + * Caller must hold the transport send lock. */ void rpcrdma_ia_remove(struct rpcrdma_ia *ia) @@ -403,11 +392,6 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_req *req; - struct rpcrdma_rep *rep; - - cancel_delayed_work_sync(&buf->rb_refresh_worker); /* This is similar to rpcrdma_ep_destroy, but: * - Don't cancel the connect worker. @@ -429,14 +413,10 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) /* The ULP is responsible for ensuring all DMA * mappings and MRs are gone. */ - list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list) - rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf); - list_for_each_entry(req, &buf->rb_allreqs, rl_all) { - rpcrdma_regbuf_dma_unmap(req->rl_rdmabuf); - rpcrdma_regbuf_dma_unmap(req->rl_sendbuf); - rpcrdma_regbuf_dma_unmap(req->rl_recvbuf); - } - rpcrdma_mrs_destroy(buf); + rpcrdma_reps_unmap(r_xprt); + rpcrdma_reqs_reset(r_xprt); + rpcrdma_mrs_destroy(r_xprt); + rpcrdma_sendctxs_destroy(r_xprt); ib_dealloc_pd(ia->ri_pd); ia->ri_pd = NULL; @@ -479,30 +459,20 @@ int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; struct ib_cq *sendcq, *recvcq; - unsigned int max_sge; int rc; - ep->rep_max_requests = xprt_rdma_slot_table_entries; + ep->rep_max_requests = r_xprt->rx_xprt.max_reqs; ep->rep_inline_send = xprt_rdma_max_inline_write; ep->rep_inline_recv = xprt_rdma_max_inline_read; - max_sge = min_t(unsigned int, ia->ri_id->device->attrs.max_send_sge, - RPCRDMA_MAX_SEND_SGES); - if (max_sge < RPCRDMA_MIN_SEND_SGES) { - pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge); - return -ENOMEM; - } - ia->ri_max_send_sges = max_sge; - - rc = frwr_open(ia, ep); + rc = frwr_query_device(r_xprt, ia->ri_id->device); if (rc) return rc; + r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->rep_max_requests); ep->rep_attr.event_handler = rpcrdma_qp_event_handler; ep->rep_attr.qp_context = ep; ep->rep_attr.srq = NULL; - ep->rep_attr.cap.max_send_sge = max_sge; - ep->rep_attr.cap.max_recv_sge = 1; ep->rep_attr.cap.max_inline_data = 0; ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; ep->rep_attr.qp_type = IB_QPT_RC; @@ -521,18 +491,17 @@ int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) init_waitqueue_head(&ep->rep_connect_wait); ep->rep_receive_count = 0; - sendcq = ib_alloc_cq(ia->ri_id->device, NULL, - ep->rep_attr.cap.max_send_wr + 1, - ia->ri_id->device->num_comp_vectors > 1 ? 1 : 0, - IB_POLL_WORKQUEUE); + sendcq = ib_alloc_cq_any(ia->ri_id->device, r_xprt, + ep->rep_attr.cap.max_send_wr + 1, + IB_POLL_WORKQUEUE); if (IS_ERR(sendcq)) { rc = PTR_ERR(sendcq); goto out1; } - recvcq = ib_alloc_cq(ia->ri_id->device, NULL, - ep->rep_attr.cap.max_recv_wr + 1, - 0, IB_POLL_WORKQUEUE); + recvcq = ib_alloc_cq_any(ia->ri_id->device, NULL, + ep->rep_attr.cap.max_recv_wr + 1, + IB_POLL_WORKQUEUE); if (IS_ERR(recvcq)) { rc = PTR_ERR(recvcq); goto out2; @@ -605,10 +574,11 @@ void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) * Unlike a normal reconnection, a fresh PD and a new set * of MRs and buffers is needed. */ -static int -rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, + struct ib_qp_init_attr *qp_init_attr) { + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; int rc, err; trace_xprtrdma_reinsert(r_xprt); @@ -623,15 +593,14 @@ rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err); goto out2; } + memcpy(qp_init_attr, &ep->rep_attr, sizeof(*qp_init_attr)); rc = -ENETUNREACH; - err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr); if (err) { pr_err("rpcrdma: rdma_create_qp returned %d\n", err); goto out3; } - - rpcrdma_mrs_create(r_xprt); return 0; out3: @@ -642,16 +611,14 @@ out1: return rc; } -static int -rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, - struct rpcrdma_ia *ia) +static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, + struct ib_qp_init_attr *qp_init_attr) { + struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rdma_cm_id *id, *old; int err, rc; - trace_xprtrdma_reconnect(r_xprt); - - rpcrdma_ep_disconnect(ep, ia); + rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia); rc = -EHOSTUNREACH; id = rpcrdma_create_id(r_xprt, ia); @@ -673,7 +640,7 @@ rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, goto out_destroy; } - err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); + err = rdma_create_qp(id, ia->ri_pd, qp_init_attr); if (err) goto out_destroy; @@ -698,25 +665,26 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct ib_qp_init_attr qp_init_attr; int rc; retry: + memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr)); switch (ep->rep_connected) { case 0: - dprintk("RPC: %s: connecting...\n", __func__); - rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr); if (rc) { rc = -ENETUNREACH; goto out_noupdate; } break; case -ENODEV: - rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia); + rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr); if (rc) goto out_noupdate; break; default: - rc = rpcrdma_ep_reconnect(r_xprt, ep, ia); + rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr); if (rc) goto out; } @@ -724,12 +692,19 @@ retry: ep->rep_connected = 0; xprt_clear_connected(xprt); + rpcrdma_reset_cwnd(r_xprt); rpcrdma_post_recvs(r_xprt, true); + rc = rpcrdma_sendctxs_create(r_xprt); + if (rc) + goto out; + rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); if (rc) goto out; + if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); if (ep->rep_connected <= 0) { if (ep->rep_connected == -EAGAIN) @@ -738,13 +713,19 @@ retry: goto out; } - dprintk("RPC: %s: connected\n", __func__); + rc = rpcrdma_reqs_setup(r_xprt); + if (rc) { + rpcrdma_ep_disconnect(ep, ia); + goto out; + } + rpcrdma_mrs_create(r_xprt); out: if (rc) ep->rep_connected = rc; out_noupdate: + trace_xprtrdma_connect(r_xprt, rc); return rc; } @@ -753,11 +734,8 @@ out_noupdate: * @ep: endpoint to disconnect * @ia: associated interface adapter * - * This is separate from destroy to facilitate the ability - * to reconnect without recreating the endpoint. - * - * This call is not reentrant, and must not be made in parallel - * on the same endpoint. + * Caller serializes. Either the transport send lock is held, + * or we're being called to destroy the transport. */ void rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) @@ -776,6 +754,9 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) trace_xprtrdma_disconnect(r_xprt, rc); rpcrdma_xprt_drain(r_xprt); + rpcrdma_reqs_reset(r_xprt); + rpcrdma_mrs_destroy(r_xprt); + rpcrdma_sendctxs_destroy(r_xprt); } /* Fixed-size circular FIFO queue. This implementation is wait-free and @@ -795,27 +776,28 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) * queue activity, and rpcrdma_xprt_drain has flushed all remaining * Send requests. */ -static void rpcrdma_sendctxs_destroy(struct rpcrdma_buffer *buf) +static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt) { + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; unsigned long i; + if (!buf->rb_sc_ctxs) + return; for (i = 0; i <= buf->rb_sc_last; i++) kfree(buf->rb_sc_ctxs[i]); kfree(buf->rb_sc_ctxs); + buf->rb_sc_ctxs = NULL; } -static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ia *ia) +static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep) { struct rpcrdma_sendctx *sc; - sc = kzalloc(struct_size(sc, sc_sges, ia->ri_max_send_sges), + sc = kzalloc(struct_size(sc, sc_sges, ep->rep_attr.cap.max_send_sge), GFP_KERNEL); if (!sc) return NULL; - sc->sc_wr.wr_cqe = &sc->sc_cqe; - sc->sc_wr.sg_list = sc->sc_sges; - sc->sc_wr.opcode = IB_WR_SEND; sc->sc_cqe.done = rpcrdma_wc_send; return sc; } @@ -831,22 +813,22 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) * the ->send_request call to fail temporarily before too many * Sends are posted. */ - i = buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; - dprintk("RPC: %s: allocating %lu send_ctxs\n", __func__, i); + i = r_xprt->rx_ep.rep_max_requests + RPCRDMA_MAX_BC_REQUESTS; buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL); if (!buf->rb_sc_ctxs) return -ENOMEM; buf->rb_sc_last = i - 1; for (i = 0; i <= buf->rb_sc_last; i++) { - sc = rpcrdma_sendctx_create(&r_xprt->rx_ia); + sc = rpcrdma_sendctx_create(&r_xprt->rx_ep); if (!sc) return -ENOMEM; - sc->sc_xprt = r_xprt; buf->rb_sc_ctxs[i] = sc; } + buf->rb_sc_head = 0; + buf->rb_sc_tail = 0; return 0; } @@ -906,6 +888,7 @@ out_emptyq: /** * rpcrdma_sendctx_put_locked - Release a send context + * @r_xprt: controlling transport instance * @sc: send context to release * * Usage: Called from Send completion to return a sendctxt @@ -913,10 +896,10 @@ out_emptyq: * * The caller serializes calls to this function (per transport). */ -static void -rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) +static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_sendctx *sc) { - struct rpcrdma_buffer *buf = &sc->sc_xprt->rx_buf; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; unsigned long next_tail; /* Unmap SGEs of previously completed but unsignaled @@ -934,7 +917,7 @@ rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) /* Paired with READ_ONCE */ smp_store_release(&buf->rb_sc_tail, next_tail); - xprt_write_space(&sc->sc_xprt->rx_xprt); + xprt_write_space(&r_xprt->rx_xprt); } static void @@ -943,14 +926,12 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ia *ia = &r_xprt->rx_ia; unsigned int count; - LIST_HEAD(free); - LIST_HEAD(all); - for (count = 0; count < ia->ri_max_segs; count++) { + for (count = 0; count < ia->ri_max_rdma_segs; count++) { struct rpcrdma_mr *mr; int rc; - mr = kzalloc(sizeof(*mr), GFP_KERNEL); + mr = kzalloc(sizeof(*mr), GFP_NOFS); if (!mr) break; @@ -962,15 +943,13 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) mr->mr_xprt = r_xprt; - list_add(&mr->mr_list, &free); - list_add(&mr->mr_all, &all); + spin_lock(&buf->rb_lock); + rpcrdma_mr_push(mr, &buf->rb_mrs); + list_add(&mr->mr_all, &buf->rb_all_mrs); + spin_unlock(&buf->rb_lock); } - spin_lock(&buf->rb_mrlock); - list_splice(&free, &buf->rb_mrs); - list_splice(&all, &buf->rb_all); r_xprt->rx_stats.mrs_allocated += count; - spin_unlock(&buf->rb_mrlock); trace_xprtrdma_createmrs(r_xprt, count); } @@ -978,7 +957,7 @@ static void rpcrdma_mr_refresh_worker(struct work_struct *work) { struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, - rb_refresh_worker.work); + rb_refresh_worker); struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); @@ -987,6 +966,28 @@ rpcrdma_mr_refresh_worker(struct work_struct *work) } /** + * rpcrdma_mrs_refresh - Wake the MR refresh worker + * @r_xprt: controlling transport instance + * + */ +void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + + /* If there is no underlying device, it's no use to + * wake the refresh worker. + */ + if (ep->rep_connected != -ENODEV) { + /* The work is scheduled on a WQ_MEM_RECLAIM + * workqueue in order to prevent MR allocation + * from recursing into NFS during direct reclaim. + */ + queue_work(xprtiod_workqueue, &buf->rb_refresh_worker); + } +} + +/** * rpcrdma_req_create - Allocate an rpcrdma_req object * @r_xprt: controlling r_xprt * @size: initial size, in bytes, of send and receive buffers @@ -998,45 +999,120 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, gfp_t flags) { struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; - struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; req = kzalloc(sizeof(*req), flags); if (req == NULL) goto out1; - rb = rpcrdma_regbuf_alloc(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, flags); - if (!rb) - goto out2; - req->rl_rdmabuf = rb; - xdr_buf_init(&req->rl_hdrbuf, rdmab_data(rb), rdmab_length(rb)); - req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE, flags); if (!req->rl_sendbuf) - goto out3; + goto out2; req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE, flags); if (!req->rl_recvbuf) - goto out4; + goto out3; + INIT_LIST_HEAD(&req->rl_free_mrs); INIT_LIST_HEAD(&req->rl_registered); spin_lock(&buffer->rb_lock); list_add(&req->rl_all, &buffer->rb_allreqs); spin_unlock(&buffer->rb_lock); return req; -out4: - kfree(req->rl_sendbuf); out3: - kfree(req->rl_rdmabuf); + kfree(req->rl_sendbuf); out2: kfree(req); out1: return NULL; } -static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, - bool temp) +/** + * rpcrdma_req_setup - Per-connection instance setup of an rpcrdma_req object + * @r_xprt: controlling transport instance + * @req: rpcrdma_req object to set up + * + * Returns zero on success, and a negative errno on failure. + */ +int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + struct rpcrdma_regbuf *rb; + size_t maxhdrsize; + + /* Compute maximum header buffer size in bytes */ + maxhdrsize = rpcrdma_fixed_maxsz + 3 + + r_xprt->rx_ia.ri_max_rdma_segs * rpcrdma_readchunk_maxsz; + maxhdrsize *= sizeof(__be32); + rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize), + DMA_TO_DEVICE, GFP_KERNEL); + if (!rb) + goto out; + + if (!__rpcrdma_regbuf_dma_map(r_xprt, rb)) + goto out_free; + + req->rl_rdmabuf = rb; + xdr_buf_init(&req->rl_hdrbuf, rdmab_data(rb), rdmab_length(rb)); + return 0; + +out_free: + rpcrdma_regbuf_free(rb); +out: + return -ENOMEM; +} + +/* ASSUMPTION: the rb_allreqs list is stable for the duration, + * and thus can be walked without holding rb_lock. Eg. the + * caller is holding the transport send lock to exclude + * device removal or disconnection. + */ +static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_req *req; + int rc; + + list_for_each_entry(req, &buf->rb_allreqs, rl_all) { + rc = rpcrdma_req_setup(r_xprt, req); + if (rc) + return rc; + } + return 0; +} + +static void rpcrdma_req_reset(struct rpcrdma_req *req) +{ + /* Credits are valid for only one connection */ + req->rl_slot.rq_cong = 0; + + rpcrdma_regbuf_free(req->rl_rdmabuf); + req->rl_rdmabuf = NULL; + + rpcrdma_regbuf_dma_unmap(req->rl_sendbuf); + rpcrdma_regbuf_dma_unmap(req->rl_recvbuf); +} + +/* ASSUMPTION: the rb_allreqs list is stable for the duration, + * and thus can be walked without holding rb_lock. Eg. the + * caller is holding the transport send lock to exclude + * device removal or disconnection. + */ +static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_req *req; + + list_for_each_entry(req, &buf->rb_allreqs, rl_all) + rpcrdma_req_reset(req); +} + +/* No locking needed here. This function is called only by the + * Receive completion handler. + */ +static noinline +struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, + bool temp) { struct rpcrdma_rep *rep; @@ -1049,6 +1125,9 @@ static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, if (!rep->rr_rdmabuf) goto out_free; + if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) + goto out_free_regbuf; + xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf), rdmab_length(rep->rr_rdmabuf)); rep->rr_cqe.done = rpcrdma_wc_receive; @@ -1058,14 +1137,63 @@ static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; rep->rr_recv_wr.num_sge = 1; rep->rr_temp = temp; + list_add(&rep->rr_all, &r_xprt->rx_buf.rb_all_reps); return rep; +out_free_regbuf: + rpcrdma_regbuf_free(rep->rr_rdmabuf); out_free: kfree(rep); out: return NULL; } +/* No locking needed here. This function is invoked only by the + * Receive completion handler, or during transport shutdown. + */ +static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) +{ + list_del(&rep->rr_all); + rpcrdma_regbuf_free(rep->rr_rdmabuf); + kfree(rep); +} + +static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) +{ + struct llist_node *node; + + /* Calls to llist_del_first are required to be serialized */ + node = llist_del_first(&buf->rb_free_reps); + if (!node) + return NULL; + return llist_entry(node, struct rpcrdma_rep, rr_node); +} + +static void rpcrdma_rep_put(struct rpcrdma_buffer *buf, + struct rpcrdma_rep *rep) +{ + llist_add(&rep->rr_node, &buf->rb_free_reps); +} + +static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_rep *rep; + + list_for_each_entry(rep, &buf->rb_all_reps, rr_all) { + rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf); + rep->rr_temp = true; + } +} + +static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_rep *rep; + + while ((rep = rpcrdma_rep_get_locked(buf)) != NULL) + rpcrdma_rep_destroy(rep); +} + /** * rpcrdma_buffer_create - Create initial set of req/rep objects * @r_xprt: transport instance to (re)initialize @@ -1077,37 +1205,28 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; int i, rc; - buf->rb_max_requests = r_xprt->rx_ep.rep_max_requests; buf->rb_bc_srv_max_requests = 0; - spin_lock_init(&buf->rb_mrlock); spin_lock_init(&buf->rb_lock); INIT_LIST_HEAD(&buf->rb_mrs); - INIT_LIST_HEAD(&buf->rb_all); - INIT_DELAYED_WORK(&buf->rb_refresh_worker, - rpcrdma_mr_refresh_worker); - - rpcrdma_mrs_create(r_xprt); + INIT_LIST_HEAD(&buf->rb_all_mrs); + INIT_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); INIT_LIST_HEAD(&buf->rb_send_bufs); INIT_LIST_HEAD(&buf->rb_allreqs); + INIT_LIST_HEAD(&buf->rb_all_reps); rc = -ENOMEM; - for (i = 0; i < buf->rb_max_requests; i++) { + for (i = 0; i < r_xprt->rx_xprt.max_reqs; i++) { struct rpcrdma_req *req; - req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE, + req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2, GFP_KERNEL); if (!req) goto out; list_add(&req->rl_list, &buf->rb_send_bufs); } - buf->rb_credits = 1; - INIT_LIST_HEAD(&buf->rb_recv_bufs); - - rc = rpcrdma_sendctxs_create(r_xprt); - if (rc) - goto out; + init_llist_head(&buf->rb_free_reps); return 0; out: @@ -1115,58 +1234,62 @@ out: return rc; } -static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) -{ - rpcrdma_regbuf_free(rep->rr_rdmabuf); - kfree(rep); -} - /** * rpcrdma_req_destroy - Destroy an rpcrdma_req object * @req: unused object to be destroyed * - * This function assumes that the caller prevents concurrent device - * unload and transport tear-down. + * Relies on caller holding the transport send lock to protect + * removing req->rl_all from buf->rb_all_reqs safely. */ -void -rpcrdma_req_destroy(struct rpcrdma_req *req) +void rpcrdma_req_destroy(struct rpcrdma_req *req) { + struct rpcrdma_mr *mr; + list_del(&req->rl_all); + while ((mr = rpcrdma_mr_pop(&req->rl_free_mrs))) { + struct rpcrdma_buffer *buf = &mr->mr_xprt->rx_buf; + + spin_lock(&buf->rb_lock); + list_del(&mr->mr_all); + spin_unlock(&buf->rb_lock); + + frwr_release_mr(mr); + } + rpcrdma_regbuf_free(req->rl_recvbuf); rpcrdma_regbuf_free(req->rl_sendbuf); rpcrdma_regbuf_free(req->rl_rdmabuf); kfree(req); } -static void -rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) +/** + * rpcrdma_mrs_destroy - Release all of a transport's MRs + * @r_xprt: controlling transport instance + * + * Relies on caller holding the transport send lock to protect + * removing mr->mr_list from req->rl_free_mrs safely. + */ +static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, - rx_buf); + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_mr *mr; - unsigned int count; - count = 0; - spin_lock(&buf->rb_mrlock); - while (!list_empty(&buf->rb_all)) { - mr = list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all); - list_del(&mr->mr_all); + cancel_work_sync(&buf->rb_refresh_worker); - spin_unlock(&buf->rb_mrlock); - - /* Ensure MW is not on any rl_registered list */ - if (!list_empty(&mr->mr_list)) - list_del(&mr->mr_list); + spin_lock(&buf->rb_lock); + while ((mr = list_first_entry_or_null(&buf->rb_all_mrs, + struct rpcrdma_mr, + mr_all)) != NULL) { + list_del(&mr->mr_list); + list_del(&mr->mr_all); + spin_unlock(&buf->rb_lock); frwr_release_mr(mr); - count++; - spin_lock(&buf->rb_mrlock); - } - spin_unlock(&buf->rb_mrlock); - r_xprt->rx_stats.mrs_allocated = 0; - dprintk("RPC: %s: released %u MRs\n", __func__, count); + spin_lock(&buf->rb_lock); + } + spin_unlock(&buf->rb_lock); } /** @@ -1180,18 +1303,7 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - cancel_delayed_work_sync(&buf->rb_refresh_worker); - - rpcrdma_sendctxs_destroy(buf); - - while (!list_empty(&buf->rb_recv_bufs)) { - struct rpcrdma_rep *rep; - - rep = list_first_entry(&buf->rb_recv_bufs, - struct rpcrdma_rep, rr_list); - list_del(&rep->rr_list); - rpcrdma_rep_destroy(rep); - } + rpcrdma_reps_destroy(buf); while (!list_empty(&buf->rb_send_bufs)) { struct rpcrdma_req *req; @@ -1201,8 +1313,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) list_del(&req->rl_list); rpcrdma_req_destroy(req); } - - rpcrdma_mrs_destroy(buf); } /** @@ -1216,54 +1326,20 @@ struct rpcrdma_mr * rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_mr *mr = NULL; - - spin_lock(&buf->rb_mrlock); - if (!list_empty(&buf->rb_mrs)) - mr = rpcrdma_mr_pop(&buf->rb_mrs); - spin_unlock(&buf->rb_mrlock); + struct rpcrdma_mr *mr; - if (!mr) - goto out_nomrs; + spin_lock(&buf->rb_lock); + mr = rpcrdma_mr_pop(&buf->rb_mrs); + spin_unlock(&buf->rb_lock); return mr; - -out_nomrs: - trace_xprtrdma_nomrs(r_xprt); - if (r_xprt->rx_ep.rep_connected != -ENODEV) - schedule_delayed_work(&buf->rb_refresh_worker, 0); - - /* Allow the reply handler and refresh worker to run */ - cond_resched(); - - return NULL; -} - -static void -__rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr) -{ - spin_lock(&buf->rb_mrlock); - rpcrdma_mr_push(mr, &buf->rb_mrs); - spin_unlock(&buf->rb_mrlock); } /** - * rpcrdma_mr_put - Release an rpcrdma_mr object - * @mr: object to release + * rpcrdma_mr_put - DMA unmap an MR and release it + * @mr: MR to release * */ -void -rpcrdma_mr_put(struct rpcrdma_mr *mr) -{ - __rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr); -} - -/** - * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it - * @mr: object to release - * - */ -void -rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) +void rpcrdma_mr_put(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; @@ -1273,7 +1349,8 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } - __rpcrdma_mr_put(&r_xprt->rx_buf, mr); + + rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); } /** @@ -1304,39 +1381,24 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) */ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) { - struct rpcrdma_rep *rep = req->rl_reply; - + if (req->rl_reply) + rpcrdma_rep_put(buffers, req->rl_reply); req->rl_reply = NULL; spin_lock(&buffers->rb_lock); list_add(&req->rl_list, &buffers->rb_send_bufs); - if (rep) { - if (!rep->rr_temp) { - list_add(&rep->rr_list, &buffers->rb_recv_bufs); - rep = NULL; - } - } spin_unlock(&buffers->rb_lock); - if (rep) - rpcrdma_rep_destroy(rep); } -/* - * Put reply buffers back into pool when not attached to - * request. This happens in error conditions. +/** + * rpcrdma_recv_buffer_put - Release rpcrdma_rep back to free list + * @rep: rep to release + * + * Used after error conditions. */ -void -rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) +void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) { - struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf; - - if (!rep->rr_temp) { - spin_lock(&buffers->rb_lock); - list_add(&rep->rr_list, &buffers->rb_recv_bufs); - spin_unlock(&buffers->rb_lock); - } else { - rpcrdma_rep_destroy(rep); - } + rpcrdma_rep_put(&rep->rr_rxprt->rx_buf, rep); } /* Returns a pointer to a rpcrdma_regbuf object, or NULL. @@ -1453,7 +1515,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_req *req) { - struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr; + struct ib_send_wr *send_wr = &req->rl_wr; int rc; if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) { @@ -1471,12 +1533,17 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, return 0; } -static void -rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) +/** + * rpcrdma_post_recvs - Refill the Receive Queue + * @r_xprt: controlling transport instance + * @temp: mark Receive buffers to be deleted after use + * + */ +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct ib_recv_wr *i, *wr, *bad_wr; + struct ib_recv_wr *wr, *bad_wr; struct rpcrdma_rep *rep; int needed, count, rc; @@ -1484,7 +1551,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) count = 0; needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); - if (ep->rep_receive_count > needed) + if (likely(ep->rep_receive_count > needed)) goto out; needed -= ep->rep_receive_count; if (!temp) @@ -1492,42 +1559,26 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) /* fast path: all needed reps can be found on the free list */ wr = NULL; - spin_lock(&buf->rb_lock); while (needed) { - rep = list_first_entry_or_null(&buf->rb_recv_bufs, - struct rpcrdma_rep, rr_list); + rep = rpcrdma_rep_get_locked(buf); + if (rep && rep->rr_temp) { + rpcrdma_rep_destroy(rep); + continue; + } if (!rep) - break; - - list_del(&rep->rr_list); - rep->rr_recv_wr.next = wr; - wr = &rep->rr_recv_wr; - --needed; - } - spin_unlock(&buf->rb_lock); - - while (needed) { - rep = rpcrdma_rep_create(r_xprt, temp); + rep = rpcrdma_rep_create(r_xprt, temp); if (!rep) break; + trace_xprtrdma_post_recv(rep); rep->rr_recv_wr.next = wr; wr = &rep->rr_recv_wr; --needed; + ++count; } if (!wr) goto out; - for (i = wr; i; i = i->next) { - rep = container_of(i, struct rpcrdma_rep, rr_recv_wr); - - if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) - goto release_wrs; - - trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe); - ++count; - } - rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); out: @@ -1544,11 +1595,4 @@ out: } ep->rep_receive_count += count; return; - -release_wrs: - for (i = wr; i;) { - rep = container_of(i, struct rpcrdma_rep, rr_recv_wr); - i = i->next; - rpcrdma_recv_buffer_put(rep); - } } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 92ce09fcea74..37d5080c250b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -47,6 +47,7 @@ #include <linux/atomic.h> /* atomic_t, etc */ #include <linux/kref.h> /* struct kref */ #include <linux/workqueue.h> /* struct work_struct */ +#include <linux/llist.h> #include <rdma/rdma_cm.h> /* RDMA connection api */ #include <rdma/ib_verbs.h> /* RDMA verbs api */ @@ -70,9 +71,8 @@ struct rpcrdma_ia { struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; int ri_async_rc; - unsigned int ri_max_segs; + unsigned int ri_max_rdma_segs; unsigned int ri_max_frwr_depth; - unsigned int ri_max_send_sges; bool ri_implicit_roundup; enum ib_mr_type ri_mrtype; unsigned long ri_flags; @@ -98,7 +98,7 @@ struct rpcrdma_ep { wait_queue_head_t rep_connect_wait; struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; - unsigned int rep_max_requests; /* set by /proc */ + unsigned int rep_max_requests; /* depends on device */ unsigned int rep_inline_send; /* negotiated */ unsigned int rep_inline_recv; /* negotiated */ int rep_receive_count; @@ -117,9 +117,6 @@ struct rpcrdma_ep { #endif /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV - * - * The below structure appears at the front of a large region of kmalloc'd - * memory, which always starts on a good alignment boundary. */ struct rpcrdma_regbuf { @@ -158,25 +155,22 @@ static inline void *rdmab_data(const struct rpcrdma_regbuf *rb) /* To ensure a transport can always make forward progress, * the number of RDMA segments allowed in header chunk lists - * is capped at 8. This prevents less-capable devices and - * memory registrations from overrunning the Send buffer - * while building chunk lists. + * is capped at 16. This prevents less-capable devices from + * overrunning the Send buffer while building chunk lists. * * Elements of the Read list take up more room than the - * Write list or Reply chunk. 8 read segments means the Read - * list (or Write list or Reply chunk) cannot consume more - * than - * - * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes. + * Write list or Reply chunk. 16 read segments means the + * chunk lists cannot consume more than * - * And the fixed part of the header is another 24 bytes. + * ((16 + 2) * read segment size) + 1 XDR words, * - * The smallest inline threshold is 1024 bytes, ensuring that - * at least 750 bytes are available for RPC messages. + * or about 400 bytes. The fixed part of the header is + * another 24 bytes. Thus when the inline threshold is + * 1024 bytes, at least 600 bytes are available for RPC + * message bodies. */ enum { - RPCRDMA_MAX_HDR_SEGS = 8, - RPCRDMA_HDRBUF_SIZE = 256, + RPCRDMA_MAX_HDR_SEGS = 16, }; /* @@ -206,8 +200,9 @@ struct rpcrdma_rep { struct rpc_rqst *rr_rqst; struct xdr_buf rr_hdrbuf; struct xdr_stream rr_stream; - struct list_head rr_list; + struct llist_node rr_node; struct ib_recv_wr rr_recv_wr; + struct list_head rr_all; }; /* To reduce the rate at which a transport invokes ib_post_recv @@ -223,12 +218,8 @@ enum { /* struct rpcrdma_sendctx - DMA mapped SGEs to unmap after Send completes */ struct rpcrdma_req; -struct rpcrdma_xprt; struct rpcrdma_sendctx { - struct ib_send_wr sc_wr; struct ib_cqe sc_cqe; - struct ib_device *sc_device; - struct rpcrdma_xprt *sc_xprt; struct rpcrdma_req *sc_req; unsigned int sc_unmap_count; struct ib_sge sc_sges[]; @@ -240,20 +231,20 @@ struct rpcrdma_sendctx { * An external memory region is any buffer or page that is registered * on the fly (ie, not pre-registered). */ -struct rpcrdma_req; struct rpcrdma_frwr { struct ib_mr *fr_mr; struct ib_cqe fr_cqe; struct completion fr_linv_done; - struct rpcrdma_req *fr_req; union { struct ib_reg_wr fr_regwr; struct ib_send_wr fr_invwr; }; }; +struct rpcrdma_req; struct rpcrdma_mr { struct list_head mr_list; + struct rpcrdma_req *mr_req; struct scatterlist *mr_sg; int mr_nents; enum dma_data_direction mr_dir; @@ -262,7 +253,6 @@ struct rpcrdma_mr { u32 mr_handle; u32 mr_length; u64 mr_offset; - struct work_struct mr_recycle; struct list_head mr_all; }; @@ -323,6 +313,7 @@ struct rpcrdma_req { struct rpcrdma_rep *rl_reply; struct xdr_stream rl_stream; struct xdr_buf rl_hdrbuf; + struct ib_send_wr rl_wr; struct rpcrdma_sendctx *rl_sendctx; struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */ struct rpcrdma_regbuf *rl_sendbuf; /* rq_snd_buf */ @@ -331,7 +322,8 @@ struct rpcrdma_req { struct list_head rl_all; struct kref rl_kref; - struct list_head rl_registered; /* registered segments */ + struct list_head rl_free_mrs; + struct list_head rl_registered; struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; @@ -344,7 +336,7 @@ rpcr_to_rdmar(const struct rpc_rqst *rqst) static inline void rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list) { - list_add_tail(&mr->mr_list, list); + list_add(&mr->mr_list, list); } static inline struct rpcrdma_mr * @@ -352,8 +344,9 @@ rpcrdma_mr_pop(struct list_head *list) { struct rpcrdma_mr *mr; - mr = list_first_entry(list, struct rpcrdma_mr, mr_list); - list_del_init(&mr->mr_list); + mr = list_first_entry_or_null(list, struct rpcrdma_mr, mr_list); + if (mr) + list_del_init(&mr->mr_list); return mr; } @@ -364,27 +357,28 @@ rpcrdma_mr_pop(struct list_head *list) * One of these is associated with a transport instance */ struct rpcrdma_buffer { - spinlock_t rb_mrlock; /* protect rb_mrs list */ + spinlock_t rb_lock; + struct list_head rb_send_bufs; struct list_head rb_mrs; - struct list_head rb_all; unsigned long rb_sc_head; unsigned long rb_sc_tail; unsigned long rb_sc_last; struct rpcrdma_sendctx **rb_sc_ctxs; - spinlock_t rb_lock; /* protect buf lists */ - struct list_head rb_send_bufs; - struct list_head rb_recv_bufs; struct list_head rb_allreqs; + struct list_head rb_all_mrs; + struct list_head rb_all_reps; - u32 rb_max_requests; + struct llist_head rb_free_reps; + + __be32 rb_max_requests; u32 rb_credits; /* most recent credit grant */ u32 rb_bc_srv_max_requests; u32 rb_bc_max_requests; - struct delayed_work rb_refresh_worker; + struct work_struct rb_refresh_worker; }; /* @@ -477,12 +471,14 @@ void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_req *); +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); /* * Buffer calls - xprtrdma/verbs.c */ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, gfp_t flags); +int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void rpcrdma_req_destroy(struct rpcrdma_req *req); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); @@ -490,13 +486,7 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); void rpcrdma_mr_put(struct rpcrdma_mr *mr); -void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr); - -static inline void -rpcrdma_mr_recycle(struct rpcrdma_mr *mr) -{ - schedule_work(&mr->mr_recycle); -} +void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, @@ -545,16 +535,15 @@ rpcrdma_data_dir(bool writing) /* Memory registration calls xprtrdma/frwr_ops.c */ -bool frwr_is_supported(struct ib_device *device); void frwr_reset(struct rpcrdma_req *req); -int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep); +int frwr_query_device(struct rpcrdma_xprt *r_xprt, + const struct ib_device *device); int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr); void frwr_release_mr(struct rpcrdma_mr *mr); -size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, __be32 xid, - struct rpcrdma_mr **mr); + struct rpcrdma_mr *mr); int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req); void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs); void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); @@ -566,6 +555,8 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); enum rpcrdma_chunktype { rpcrdma_noch = 0, + rpcrdma_noch_pullup, + rpcrdma_noch_mapped, rpcrdma_readch, rpcrdma_areadch, rpcrdma_writech, @@ -579,6 +570,7 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc); int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); +void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt); void rpcrdma_complete_rqst(struct rpcrdma_rep *rep); void rpcrdma_reply_handler(struct rpcrdma_rep *rep); @@ -590,7 +582,6 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) /* RPC/RDMA module init - xprtrdma/transport.c */ -extern unsigned int xprt_rdma_slot_table_entries; extern unsigned int xprt_rdma_max_inline_read; extern unsigned int xprt_rdma_max_inline_write; void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e2176c167a57..d86c664ea6af 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -562,10 +562,14 @@ xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) printk(KERN_WARNING "Callback slot table overflowed\n"); return -ESHUTDOWN; } + if (transport->recv.copied && !req->rq_private_buf.len) + return -ESHUTDOWN; ret = xs_read_stream_request(transport, msg, flags, req); if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) xprt_complete_bc_request(req, transport->recv.copied); + else + req->rq_private_buf.len = transport->recv.copied; return ret; } @@ -587,7 +591,7 @@ xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags) /* Look up and lock the request corresponding to the given XID */ spin_lock(&xprt->queue_lock); req = xprt_lookup_rqst(xprt, transport->recv.xid); - if (!req) { + if (!req || (transport->recv.copied && !req->rq_private_buf.len)) { msg->msg_flags |= MSG_TRUNC; goto out; } @@ -599,6 +603,8 @@ xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags) spin_lock(&xprt->queue_lock); if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) xprt_complete_rqst(req->rq_task, transport->recv.copied); + else + req->rq_private_buf.len = transport->recv.copied; xprt_unpin_rqst(req); out: spin_unlock(&xprt->queue_lock); @@ -1243,19 +1249,21 @@ static void xs_error_report(struct sock *sk) { struct sock_xprt *transport; struct rpc_xprt *xprt; - int err; read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) goto out; transport = container_of(xprt, struct sock_xprt, xprt); - err = -sk->sk_err; - if (err == 0) + transport->xprt_err = -sk->sk_err; + if (transport->xprt_err == 0) goto out; dprintk("RPC: xs_error_report client %p, error=%d...\n", - xprt, -err); - trace_rpc_socket_error(xprt, sk->sk_socket, err); + xprt, -transport->xprt_err); + trace_rpc_socket_error(xprt, sk->sk_socket, transport->xprt_err); + + /* barrier ensures xprt_err is set before XPRT_SOCK_WAKE_ERROR */ + smp_mb__before_atomic(); xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR); out: read_unlock_bh(&sk->sk_callback_lock); @@ -1744,7 +1752,7 @@ static void xs_set_port(struct rpc_xprt *xprt, unsigned short port) static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock) { - if (transport->srcport == 0) + if (transport->srcport == 0 && transport->xprt.reuseport) transport->srcport = xs_sock_getport(sock); } @@ -2470,7 +2478,6 @@ static void xs_wake_write(struct sock_xprt *transport) static void xs_wake_error(struct sock_xprt *transport) { int sockerr; - int sockerr_len = sizeof(sockerr); if (!test_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) return; @@ -2479,9 +2486,7 @@ static void xs_wake_error(struct sock_xprt *transport) goto out; if (!test_and_clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) goto out; - if (kernel_getsockopt(transport->sock, SOL_SOCKET, SO_ERROR, - (char *)&sockerr, &sockerr_len) != 0) - goto out; + sockerr = xchg(&transport->xprt_err, 0); if (sockerr < 0) xprt_wake_pending_tasks(&transport->xprt, sockerr); out: @@ -2654,6 +2659,8 @@ static int bc_sendto(struct rpc_rqst *req) .iov_len = sizeof(marker), }; + req->rq_xtime = ktime_get(); + len = kernel_sendmsg(transport->sock, &msg, &iov, 1, iov.iov_len); if (len != iov.iov_len) return -EAGAIN; @@ -2679,7 +2686,6 @@ static int bc_send_request(struct rpc_rqst *req) struct svc_xprt *xprt; int len; - dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid)); /* * Get the server socket associated with this callback xprt */ diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig index b83e16ade4d2..716b61a701a8 100644 --- a/net/tipc/Kconfig +++ b/net/tipc/Kconfig @@ -35,6 +35,21 @@ config TIPC_MEDIA_UDP Saying Y here will enable support for running TIPC over IP/UDP bool default y +config TIPC_CRYPTO + bool "TIPC encryption support" + depends on TIPC + select CRYPTO + select CRYPTO_AES + select CRYPTO_GCM + help + Saying Y here will enable support for TIPC encryption. + All TIPC messages will be encrypted/decrypted by using the currently most + advanced algorithm: AEAD AES-GCM (like IPSec or TLS) before leaving/ + entering the TIPC stack. + Key setting from user-space is performed via netlink by a user program + (e.g. the iproute2 'tipc' tool). + bool + default y config TIPC_DIAG tristate "TIPC: socket monitoring interface" diff --git a/net/tipc/Makefile b/net/tipc/Makefile index c86aba0282af..ee49a9f1dd4f 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -9,15 +9,14 @@ tipc-y += addr.o bcast.o bearer.o \ core.o link.o discover.o msg.o \ name_distr.o subscr.o monitor.o name_table.o net.o \ netlink.o netlink_compat.o node.o socket.o eth_media.o \ - topsrv.o socket.o group.o trace.o + topsrv.o group.o trace.o CFLAGS_trace.o += -I$(src) tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o tipc-$(CONFIG_SYSCTL) += sysctl.o +tipc-$(CONFIG_TIPC_CRYPTO) += crypto.o obj-$(CONFIG_TIPC_DIAG) += diag.o - -tipc_diag-y := diag.o diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 1336f3cdad38..4c20be08b9c4 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -84,12 +84,12 @@ static struct tipc_bc_base *tipc_bc_base(struct net *net) */ int tipc_bcast_get_mtu(struct net *net) { - return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE; + return tipc_link_mss(tipc_bc_sndlink(net)); } -void tipc_bcast_disable_rcast(struct net *net) +void tipc_bcast_toggle_rcast(struct net *net, bool supp) { - tipc_bc_base(net)->rcast_support = false; + tipc_bc_base(net)->rcast_support = supp; } static void tipc_bcbase_calc_bc_threshold(struct net *net) @@ -185,7 +185,7 @@ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq) } /* We have to transmit across all bearers */ - skb_queue_head_init(&_xmitq); + __skb_queue_head_init(&_xmitq); for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { if (!bb->dests[bearer_id]) continue; @@ -256,7 +256,7 @@ static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, struct sk_buff_head xmitq; int rc = 0; - skb_queue_head_init(&xmitq); + __skb_queue_head_init(&xmitq); tipc_bcast_lock(net); if (tipc_link_bc_peers(l)) rc = tipc_link_xmit(l, pkts, &xmitq); @@ -286,7 +286,7 @@ static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts, u32 dnode, selector; selector = msg_link_selector(buf_msg(skb_peek(pkts))); - skb_queue_head_init(&_pkts); + __skb_queue_head_init(&_pkts); list_for_each_entry_safe(dst, tmp, &dests->list, list) { dnode = dst->node; @@ -305,17 +305,17 @@ static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts, * @skb: socket buffer to copy * @method: send method to be used * @dests: destination nodes for message. - * @cong_link_cnt: returns number of encountered congested destination links * Returns 0 if success, otherwise errno */ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb, struct tipc_mc_method *method, - struct tipc_nlist *dests, - u16 *cong_link_cnt) + struct tipc_nlist *dests) { struct tipc_msg *hdr, *_hdr; struct sk_buff_head tmpq; struct sk_buff *_skb; + u16 cong_link_cnt; + int rc = 0; /* Is a cluster supporting with new capabilities ? */ if (!(tipc_net(net)->capabilities & TIPC_MCAST_RBCTL)) @@ -343,18 +343,19 @@ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb, _hdr = buf_msg(_skb); msg_set_size(_hdr, MCAST_H_SIZE); msg_set_is_rcast(_hdr, !msg_is_rcast(hdr)); + msg_set_errcode(_hdr, TIPC_ERR_NO_PORT); - skb_queue_head_init(&tmpq); + __skb_queue_head_init(&tmpq); __skb_queue_tail(&tmpq, _skb); if (method->rcast) - tipc_bcast_xmit(net, &tmpq, cong_link_cnt); + rc = tipc_bcast_xmit(net, &tmpq, &cong_link_cnt); else - tipc_rcast_xmit(net, &tmpq, dests, cong_link_cnt); + rc = tipc_rcast_xmit(net, &tmpq, dests, &cong_link_cnt); /* This queue should normally be empty by now */ __skb_queue_purge(&tmpq); - return 0; + return rc; } /* tipc_mcast_xmit - deliver message to indicated destination nodes @@ -378,7 +379,7 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, int rc = 0; skb_queue_head_init(&inputq); - skb_queue_head_init(&localq); + __skb_queue_head_init(&localq); /* Clone packets before they are consumed by next call */ if (dests->local && !tipc_msg_reassemble(pkts, &localq)) { @@ -396,9 +397,14 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, msg_set_is_rcast(hdr, method->rcast); /* Switch method ? */ - if (rcast != method->rcast) - tipc_mcast_send_sync(net, skb, method, - dests, cong_link_cnt); + if (rcast != method->rcast) { + rc = tipc_mcast_send_sync(net, skb, method, dests); + if (unlikely(rc)) { + pr_err("Unable to send SYN: method %d, rc %d\n", + rcast, rc); + goto exit; + } + } if (method->rcast) rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt); @@ -406,8 +412,10 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, rc = tipc_bcast_xmit(net, pkts, cong_link_cnt); } - if (dests->local) + if (dests->local) { + tipc_loopback_trace(net, &localq); tipc_sk_mcast_rcv(net, &localq, &inputq); + } exit: /* This queue should normally be empty by now */ __skb_queue_purge(pkts); @@ -560,18 +568,18 @@ int tipc_bclink_reset_stats(struct net *net) return 0; } -static int tipc_bc_link_set_queue_limits(struct net *net, u32 limit) +static int tipc_bc_link_set_queue_limits(struct net *net, u32 max_win) { struct tipc_link *l = tipc_bc_sndlink(net); if (!l) return -ENOPROTOOPT; - if (limit < BCLINK_WIN_MIN) - limit = BCLINK_WIN_MIN; - if (limit > TIPC_MAX_LINK_WIN) + if (max_win < BCLINK_WIN_MIN) + max_win = BCLINK_WIN_MIN; + if (max_win > TIPC_MAX_LINK_WIN) return -EINVAL; tipc_bcast_lock(net); - tipc_link_set_queue_limits(l, limit); + tipc_link_set_queue_limits(l, BCLINK_WIN_MIN, max_win); tipc_bcast_unlock(net); return 0; } @@ -681,6 +689,7 @@ int tipc_bcast_init(struct net *net) if (!tipc_link_bc_create(net, 0, 0, FB_MTU, BCLINK_WIN_DEFAULT, + BCLINK_WIN_DEFAULT, 0, &bb->inputq, NULL, diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index dadad953e2be..9e847d9617d3 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -85,7 +85,7 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl); void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id); void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id); int tipc_bcast_get_mtu(struct net *net); -void tipc_bcast_disable_rcast(struct net *net); +void tipc_bcast_toggle_rcast(struct net *net, bool supp); int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, struct tipc_mc_method *method, struct tipc_nlist *dests, u16 *cong_link_cnt); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index a809c0ec8d15..34ca7b789eba 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -44,6 +44,7 @@ #include "netlink.h" #include "udp_media.h" #include "trace.h" +#include "crypto.h" #define MAX_ADDR_STR 60 @@ -310,11 +311,13 @@ static int tipc_enable_bearer(struct net *net, const char *name, b->identity = bearer_id; b->tolerance = m->tolerance; - b->window = m->window; + b->min_win = m->min_win; + b->max_win = m->max_win; b->domain = disc_domain; b->net_plane = bearer_id + 'A'; b->priority = prio; test_and_set_bit_lock(0, &b->up); + refcount_set(&b->refcnt, 1); res = tipc_disc_create(net, b, &b->bcast_addr, &skb); if (res) { @@ -351,6 +354,17 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b) return 0; } +bool tipc_bearer_hold(struct tipc_bearer *b) +{ + return (b && refcount_inc_not_zero(&b->refcnt)); +} + +void tipc_bearer_put(struct tipc_bearer *b) +{ + if (b && refcount_dec_and_test(&b->refcnt)) + kfree_rcu(b, rcu); +} + /** * bearer_disable * @@ -369,7 +383,7 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b) if (b->disc) tipc_disc_delete(b->disc); RCU_INIT_POINTER(tn->bearer_list[bearer_id], NULL); - kfree_rcu(b, rcu); + tipc_bearer_put(b); tipc_mon_delete(net, bearer_id); } @@ -389,6 +403,11 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, dev_put(dev); return -EINVAL; } + if (dev == net->loopback_dev) { + dev_put(dev); + pr_info("Enabling <%s> not permitted\n", b->name); + return -EINVAL; + } /* Autoconfigure own node identity if needed */ if (!tipc_own_id(net) && hwaddr_len <= NODE_ID_LEN) { @@ -499,10 +518,15 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, rcu_read_lock(); b = bearer_get(net, bearer_id); - if (likely(b && (test_bit(0, &b->up) || msg_is_reset(hdr)))) - b->media->send_msg(net, skb, b, dest); - else + if (likely(b && (test_bit(0, &b->up) || msg_is_reset(hdr)))) { +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_xmit(net, &skb, b, dest, NULL); + if (skb) +#endif + b->media->send_msg(net, skb, b, dest); + } else { kfree_skb(skb); + } rcu_read_unlock(); } @@ -510,7 +534,8 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, */ void tipc_bearer_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq, - struct tipc_media_addr *dst) + struct tipc_media_addr *dst, + struct tipc_node *__dnode) { struct tipc_bearer *b; struct sk_buff *skb, *tmp; @@ -524,10 +549,15 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id, __skb_queue_purge(xmitq); skb_queue_walk_safe(xmitq, skb, tmp) { __skb_dequeue(xmitq); - if (likely(test_bit(0, &b->up) || msg_is_reset(buf_msg(skb)))) - b->media->send_msg(net, skb, b, dst); - else + if (likely(test_bit(0, &b->up) || msg_is_reset(buf_msg(skb)))) { +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_xmit(net, &skb, b, dst, __dnode); + if (skb) +#endif + b->media->send_msg(net, skb, b, dst); + } else { kfree_skb(skb); + } } rcu_read_unlock(); } @@ -538,6 +568,7 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq) { struct tipc_net *tn = tipc_net(net); + struct tipc_media_addr *dst; int net_id = tn->net_id; struct tipc_bearer *b; struct sk_buff *skb, *tmp; @@ -552,7 +583,12 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, msg_set_non_seq(hdr, 1); msg_set_mc_netid(hdr, net_id); __skb_dequeue(xmitq); - b->media->send_msg(net, skb, b, &b->bcast_addr); + dst = &b->bcast_addr; +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_xmit(net, &skb, b, dst, NULL); + if (skb) +#endif + b->media->send_msg(net, skb, b, dst); } rcu_read_unlock(); } @@ -579,6 +615,7 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, if (likely(b && test_bit(0, &b->up) && (skb->pkt_type <= PACKET_MULTICAST))) { skb_mark_not_on_list(skb); + TIPC_SKB_CB(skb)->flags = 0; tipc_rcv(dev_net(b->pt.dev), skb, b); rcu_read_unlock(); return NET_RX_SUCCESS; @@ -674,6 +711,65 @@ void tipc_bearer_stop(struct net *net) } } +void tipc_clone_to_loopback(struct net *net, struct sk_buff_head *pkts) +{ + struct net_device *dev = net->loopback_dev; + struct sk_buff *skb, *_skb; + int exp; + + skb_queue_walk(pkts, _skb) { + skb = pskb_copy(_skb, GFP_ATOMIC); + if (!skb) + continue; + + exp = SKB_DATA_ALIGN(dev->hard_header_len - skb_headroom(skb)); + if (exp > 0 && pskb_expand_head(skb, exp, 0, GFP_ATOMIC)) { + kfree_skb(skb); + continue; + } + + skb_reset_network_header(skb); + dev_hard_header(skb, dev, ETH_P_TIPC, dev->dev_addr, + dev->dev_addr, skb->len); + skb->dev = dev; + skb->pkt_type = PACKET_HOST; + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->protocol = eth_type_trans(skb, dev); + netif_rx_ni(skb); + } +} + +static int tipc_loopback_rcv_pkt(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *od) +{ + consume_skb(skb); + return NET_RX_SUCCESS; +} + +int tipc_attach_loopback(struct net *net) +{ + struct net_device *dev = net->loopback_dev; + struct tipc_net *tn = tipc_net(net); + + if (!dev) + return -ENODEV; + + dev_hold(dev); + tn->loopback_pt.dev = dev; + tn->loopback_pt.type = htons(ETH_P_TIPC); + tn->loopback_pt.func = tipc_loopback_rcv_pkt; + dev_add_pack(&tn->loopback_pt); + return 0; +} + +void tipc_detach_loopback(struct net *net) +{ + struct tipc_net *tn = tipc_net(net); + + dev_remove_pack(&tn->loopback_pt); + dev_put(net->loopback_dev); +} + /* Caller should hold rtnl_lock to protect the bearer */ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg, struct tipc_bearer *bearer, int nlflags) @@ -701,7 +797,7 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg, goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, bearer->tolerance)) goto prop_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->max_win)) goto prop_msg_full; if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP) if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, bearer->mtu)) @@ -993,7 +1089,7 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) if (props[TIPC_NLA_PROP_PRIO]) b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) - b->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + b->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); if (props[TIPC_NLA_PROP_MTU]) { if (b->media->type_id != TIPC_MEDIA_TYPE_UDP) return -EINVAL; @@ -1047,7 +1143,7 @@ static int __tipc_nl_add_media(struct tipc_nl_msg *msg, goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, media->tolerance)) goto prop_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->max_win)) goto prop_msg_full; if (media->type_id == TIPC_MEDIA_TYPE_UDP) if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, media->mtu)) @@ -1180,7 +1276,7 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) if (props[TIPC_NLA_PROP_PRIO]) m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) - m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + m->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); if (props[TIPC_NLA_PROP_MTU]) { if (m->type_id != TIPC_MEDIA_TYPE_UDP) return -EINVAL; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 7f4c569594a5..bc0023119da2 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -119,7 +119,8 @@ struct tipc_media { char *raw); u32 priority; u32 tolerance; - u32 window; + u32 min_win; + u32 max_win; u32 mtu; u32 type_id; u32 hwaddr_len; @@ -158,13 +159,15 @@ struct tipc_bearer { struct packet_type pt; struct rcu_head rcu; u32 priority; - u32 window; + u32 min_win; + u32 max_win; u32 tolerance; u32 domain; u32 identity; struct tipc_discoverer *disc; char net_plane; unsigned long up; + refcount_t refcnt; }; struct tipc_bearer_names { @@ -210,6 +213,8 @@ int tipc_media_set_window(const char *name, u32 new_value); int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, struct nlattr *attrs[]); +bool tipc_bearer_hold(struct tipc_bearer *b); +void tipc_bearer_put(struct tipc_bearer *b); void tipc_disable_l2_media(struct tipc_bearer *b); int tipc_l2_send_msg(struct net *net, struct sk_buff *buf, struct tipc_bearer *b, struct tipc_media_addr *dest); @@ -229,9 +234,20 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, struct tipc_media_addr *dest); void tipc_bearer_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq, - struct tipc_media_addr *dst); + struct tipc_media_addr *dst, + struct tipc_node *__dnode); void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq); +void tipc_clone_to_loopback(struct net *net, struct sk_buff_head *pkts); +int tipc_attach_loopback(struct net *net); +void tipc_detach_loopback(struct net *net); + +static inline void tipc_loopback_trace(struct net *net, + struct sk_buff_head *pkts) +{ + if (unlikely(dev_nit_active(net->loopback_dev))) + tipc_clone_to_loopback(net, pkts); +} /* check if device MTU is too low for tipc headers */ static inline bool tipc_mtu_bad(struct net_device *dev, unsigned int reserve) diff --git a/net/tipc/core.c b/net/tipc/core.c index c8370722f0bb..4f6dc74adf45 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -34,8 +34,6 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include "core.h" #include "name_table.h" #include "subscr.h" @@ -44,6 +42,7 @@ #include "socket.h" #include "bcast.h" #include "node.h" +#include "crypto.h" #include <linux/module.h> @@ -68,6 +67,11 @@ static int __net_init tipc_init_net(struct net *net) INIT_LIST_HEAD(&tn->node_list); spin_lock_init(&tn->node_list_lock); +#ifdef CONFIG_TIPC_CRYPTO + err = tipc_crypto_start(&tn->crypto_tx, net, NULL); + if (err) + goto out_crypto; +#endif err = tipc_sk_rht_init(net); if (err) goto out_sk_rht; @@ -82,6 +86,10 @@ static int __net_init tipc_init_net(struct net *net) if (err) goto out_bclink; + err = tipc_attach_loopback(net); + if (err) + goto out_bclink; + return 0; out_bclink: @@ -89,17 +97,35 @@ out_bclink: out_nametbl: tipc_sk_rht_destroy(net); out_sk_rht: + +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_stop(&tn->crypto_tx); +out_crypto: +#endif return err; } static void __net_exit tipc_exit_net(struct net *net) { + tipc_detach_loopback(net); tipc_net_stop(net); tipc_bcast_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_stop(&tipc_net(net)->crypto_tx); +#endif } +static void __net_exit tipc_pernet_pre_exit(struct net *net) +{ + tipc_node_pre_cleanup_net(net); +} + +static struct pernet_operations tipc_pernet_pre_exit_ops = { + .pre_exit = tipc_pernet_pre_exit, +}; + static struct pernet_operations tipc_net_ops = { .init = tipc_init_net, .exit = tipc_exit_net, @@ -122,14 +148,6 @@ static int __init tipc_init(void) sysctl_tipc_rmem[1] = RCVBUF_DEF; sysctl_tipc_rmem[2] = RCVBUF_MAX; - err = tipc_netlink_start(); - if (err) - goto out_netlink; - - err = tipc_netlink_compat_start(); - if (err) - goto out_netlink_compat; - err = tipc_register_sysctl(); if (err) goto out_sysctl; @@ -146,13 +164,32 @@ static int __init tipc_init(void) if (err) goto out_pernet_topsrv; + err = register_pernet_subsys(&tipc_pernet_pre_exit_ops); + if (err) + goto out_register_pernet_subsys; + err = tipc_bearer_setup(); if (err) goto out_bearer; + err = tipc_netlink_start(); + if (err) + goto out_netlink; + + err = tipc_netlink_compat_start(); + if (err) + goto out_netlink_compat; + pr_info("Started in single node mode\n"); return 0; + +out_netlink_compat: + tipc_netlink_stop(); +out_netlink: + tipc_bearer_cleanup(); out_bearer: + unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); +out_register_pernet_subsys: unregister_pernet_device(&tipc_topsrv_net_ops); out_pernet_topsrv: tipc_socket_stop(); @@ -161,22 +198,19 @@ out_socket: out_pernet: tipc_unregister_sysctl(); out_sysctl: - tipc_netlink_compat_stop(); -out_netlink_compat: - tipc_netlink_stop(); -out_netlink: pr_err("Unable to start in single node mode\n"); return err; } static void __exit tipc_exit(void) { + tipc_netlink_compat_stop(); + tipc_netlink_stop(); tipc_bearer_cleanup(); + unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); unregister_pernet_device(&tipc_topsrv_net_ops); tipc_socket_stop(); unregister_pernet_device(&tipc_net_ops); - tipc_netlink_stop(); - tipc_netlink_compat_stop(); tipc_unregister_sysctl(); pr_info("Deactivated\n"); diff --git a/net/tipc/core.h b/net/tipc/core.h index 7a68e1b6a066..631d83c9705f 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -59,6 +59,13 @@ #include <net/netns/generic.h> #include <linux/rhashtable.h> #include <net/genetlink.h> +#include <net/netns/hash.h> + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt struct tipc_node; struct tipc_bearer; @@ -67,6 +74,9 @@ struct tipc_link; struct tipc_name_table; struct tipc_topsrv; struct tipc_monitor; +#ifdef CONFIG_TIPC_CRYPTO +struct tipc_crypto; +#endif #define TIPC_MOD_VER "2.0.0" @@ -125,6 +135,14 @@ struct tipc_net { /* Cluster capabilities */ u16 capabilities; + + /* Tracing of node internal messages */ + struct packet_type loopback_pt; + +#ifdef CONFIG_TIPC_CRYPTO + /* TX crypto handler */ + struct tipc_crypto *crypto_tx; +#endif }; static inline struct tipc_net *tipc_net(struct net *net) @@ -182,6 +200,11 @@ static inline int in_range(u16 val, u16 min, u16 max) return !less(val, min) && !more(val, max); } +static inline u32 tipc_net_hash_mixes(struct net *net, int tn_rand) +{ + return net_hash_mix(&init_net) ^ net_hash_mix(net) ^ tn_rand; +} + #ifdef CONFIG_SYSCTL int tipc_register_sysctl(void); void tipc_unregister_sysctl(void); diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c new file mode 100644 index 000000000000..c8c47fc72653 --- /dev/null +++ b/net/tipc/crypto.c @@ -0,0 +1,1983 @@ +// SPDX-License-Identifier: GPL-2.0 +/** + * net/tipc/crypto.c: TIPC crypto for key handling & packet en/decryption + * + * Copyright (c) 2019, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <crypto/aead.h> +#include <crypto/aes.h> +#include "crypto.h" + +#define TIPC_TX_PROBE_LIM msecs_to_jiffies(1000) /* > 1s */ +#define TIPC_TX_LASTING_LIM msecs_to_jiffies(120000) /* 2 mins */ +#define TIPC_RX_ACTIVE_LIM msecs_to_jiffies(3000) /* 3s */ +#define TIPC_RX_PASSIVE_LIM msecs_to_jiffies(180000) /* 3 mins */ +#define TIPC_MAX_TFMS_DEF 10 +#define TIPC_MAX_TFMS_LIM 1000 + +/** + * TIPC Key ids + */ +enum { + KEY_UNUSED = 0, + KEY_MIN, + KEY_1 = KEY_MIN, + KEY_2, + KEY_3, + KEY_MAX = KEY_3, +}; + +/** + * TIPC Crypto statistics + */ +enum { + STAT_OK, + STAT_NOK, + STAT_ASYNC, + STAT_ASYNC_OK, + STAT_ASYNC_NOK, + STAT_BADKEYS, /* tx only */ + STAT_BADMSGS = STAT_BADKEYS, /* rx only */ + STAT_NOKEYS, + STAT_SWITCHES, + + MAX_STATS, +}; + +/* TIPC crypto statistics' header */ +static const char *hstats[MAX_STATS] = {"ok", "nok", "async", "async_ok", + "async_nok", "badmsgs", "nokeys", + "switches"}; + +/* Max TFMs number per key */ +int sysctl_tipc_max_tfms __read_mostly = TIPC_MAX_TFMS_DEF; + +/** + * struct tipc_key - TIPC keys' status indicator + * + * 7 6 5 4 3 2 1 0 + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * key: | (reserved)|passive idx| active idx|pending idx| + * +-----+-----+-----+-----+-----+-----+-----+-----+ + */ +struct tipc_key { +#define KEY_BITS (2) +#define KEY_MASK ((1 << KEY_BITS) - 1) + union { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 pending:2, + active:2, + passive:2, /* rx only */ + reserved:2; +#elif defined(__BIG_ENDIAN_BITFIELD) + u8 reserved:2, + passive:2, /* rx only */ + active:2, + pending:2; +#else +#error "Please fix <asm/byteorder.h>" +#endif + } __packed; + u8 keys; + }; +}; + +/** + * struct tipc_tfm - TIPC TFM structure to form a list of TFMs + */ +struct tipc_tfm { + struct crypto_aead *tfm; + struct list_head list; +}; + +/** + * struct tipc_aead - TIPC AEAD key structure + * @tfm_entry: per-cpu pointer to one entry in TFM list + * @crypto: TIPC crypto owns this key + * @cloned: reference to the source key in case cloning + * @users: the number of the key users (TX/RX) + * @salt: the key's SALT value + * @authsize: authentication tag size (max = 16) + * @mode: crypto mode is applied to the key + * @hint[]: a hint for user key + * @rcu: struct rcu_head + * @seqno: the key seqno (cluster scope) + * @refcnt: the key reference counter + */ +struct tipc_aead { +#define TIPC_AEAD_HINT_LEN (5) + struct tipc_tfm * __percpu *tfm_entry; + struct tipc_crypto *crypto; + struct tipc_aead *cloned; + atomic_t users; + u32 salt; + u8 authsize; + u8 mode; + char hint[TIPC_AEAD_HINT_LEN + 1]; + struct rcu_head rcu; + + atomic64_t seqno ____cacheline_aligned; + refcount_t refcnt ____cacheline_aligned; + +} ____cacheline_aligned; + +/** + * struct tipc_crypto_stats - TIPC Crypto statistics + */ +struct tipc_crypto_stats { + unsigned int stat[MAX_STATS]; +}; + +/** + * struct tipc_crypto - TIPC TX/RX crypto structure + * @net: struct net + * @node: TIPC node (RX) + * @aead: array of pointers to AEAD keys for encryption/decryption + * @peer_rx_active: replicated peer RX active key index + * @key: the key states + * @working: the crypto is working or not + * @stats: the crypto statistics + * @sndnxt: the per-peer sndnxt (TX) + * @timer1: general timer 1 (jiffies) + * @timer2: general timer 1 (jiffies) + * @lock: tipc_key lock + */ +struct tipc_crypto { + struct net *net; + struct tipc_node *node; + struct tipc_aead __rcu *aead[KEY_MAX + 1]; /* key[0] is UNUSED */ + atomic_t peer_rx_active; + struct tipc_key key; + u8 working:1; + struct tipc_crypto_stats __percpu *stats; + + atomic64_t sndnxt ____cacheline_aligned; + unsigned long timer1; + unsigned long timer2; + spinlock_t lock; /* crypto lock */ + +} ____cacheline_aligned; + +/* struct tipc_crypto_tx_ctx - TX context for callbacks */ +struct tipc_crypto_tx_ctx { + struct tipc_aead *aead; + struct tipc_bearer *bearer; + struct tipc_media_addr dst; +}; + +/* struct tipc_crypto_rx_ctx - RX context for callbacks */ +struct tipc_crypto_rx_ctx { + struct tipc_aead *aead; + struct tipc_bearer *bearer; +}; + +static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead); +static inline void tipc_aead_put(struct tipc_aead *aead); +static void tipc_aead_free(struct rcu_head *rp); +static int tipc_aead_users(struct tipc_aead __rcu *aead); +static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim); +static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim); +static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val); +static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead); +static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, + u8 mode); +static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src); +static void *tipc_aead_mem_alloc(struct crypto_aead *tfm, + unsigned int crypto_ctx_size, + u8 **iv, struct aead_request **req, + struct scatterlist **sg, int nsg); +static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, + struct tipc_bearer *b, + struct tipc_media_addr *dst, + struct tipc_node *__dnode); +static void tipc_aead_encrypt_done(struct crypto_async_request *base, int err); +static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, + struct sk_buff *skb, struct tipc_bearer *b); +static void tipc_aead_decrypt_done(struct crypto_async_request *base, int err); +static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr); +static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, + u8 tx_key, struct sk_buff *skb, + struct tipc_crypto *__rx); +static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, + u8 new_passive, + u8 new_active, + u8 new_pending); +static int tipc_crypto_key_attach(struct tipc_crypto *c, + struct tipc_aead *aead, u8 pos); +static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending); +static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, + struct tipc_crypto *rx, + struct sk_buff *skb); +static void tipc_crypto_key_synch(struct tipc_crypto *rx, u8 new_rx_active, + struct tipc_msg *hdr); +static int tipc_crypto_key_revoke(struct net *net, u8 tx_key); +static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, + struct tipc_bearer *b, + struct sk_buff **skb, int err); +static void tipc_crypto_do_cmd(struct net *net, int cmd); +static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf); +#ifdef TIPC_CRYPTO_DEBUG +static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, + char *buf); +#endif + +#define key_next(cur) ((cur) % KEY_MAX + 1) + +#define tipc_aead_rcu_ptr(rcu_ptr, lock) \ + rcu_dereference_protected((rcu_ptr), lockdep_is_held(lock)) + +#define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \ +do { \ + typeof(rcu_ptr) __tmp = rcu_dereference_protected((rcu_ptr), \ + lockdep_is_held(lock)); \ + rcu_assign_pointer((rcu_ptr), (ptr)); \ + tipc_aead_put(__tmp); \ +} while (0) + +#define tipc_crypto_key_detach(rcu_ptr, lock) \ + tipc_aead_rcu_replace((rcu_ptr), NULL, lock) + +/** + * tipc_aead_key_validate - Validate a AEAD user key + */ +int tipc_aead_key_validate(struct tipc_aead_key *ukey) +{ + int keylen; + + /* Check if algorithm exists */ + if (unlikely(!crypto_has_alg(ukey->alg_name, 0, 0))) { + pr_info("Not found cipher: \"%s\"!\n", ukey->alg_name); + return -ENODEV; + } + + /* Currently, we only support the "gcm(aes)" cipher algorithm */ + if (strcmp(ukey->alg_name, "gcm(aes)")) + return -ENOTSUPP; + + /* Check if key size is correct */ + keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; + if (unlikely(keylen != TIPC_AES_GCM_KEY_SIZE_128 && + keylen != TIPC_AES_GCM_KEY_SIZE_192 && + keylen != TIPC_AES_GCM_KEY_SIZE_256)) + return -EINVAL; + + return 0; +} + +static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead) +{ + struct tipc_aead *tmp; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (unlikely(!tmp || !refcount_inc_not_zero(&tmp->refcnt))) + tmp = NULL; + rcu_read_unlock(); + + return tmp; +} + +static inline void tipc_aead_put(struct tipc_aead *aead) +{ + if (aead && refcount_dec_and_test(&aead->refcnt)) + call_rcu(&aead->rcu, tipc_aead_free); +} + +/** + * tipc_aead_free - Release AEAD key incl. all the TFMs in the list + * @rp: rcu head pointer + */ +static void tipc_aead_free(struct rcu_head *rp) +{ + struct tipc_aead *aead = container_of(rp, struct tipc_aead, rcu); + struct tipc_tfm *tfm_entry, *head, *tmp; + + if (aead->cloned) { + tipc_aead_put(aead->cloned); + } else { + head = *this_cpu_ptr(aead->tfm_entry); + list_for_each_entry_safe(tfm_entry, tmp, &head->list, list) { + crypto_free_aead(tfm_entry->tfm); + list_del(&tfm_entry->list); + kfree(tfm_entry); + } + /* Free the head */ + crypto_free_aead(head->tfm); + list_del(&head->list); + kfree(head); + } + free_percpu(aead->tfm_entry); + kfree(aead); +} + +static int tipc_aead_users(struct tipc_aead __rcu *aead) +{ + struct tipc_aead *tmp; + int users = 0; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (tmp) + users = atomic_read(&tmp->users); + rcu_read_unlock(); + + return users; +} + +static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim) +{ + struct tipc_aead *tmp; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (tmp) + atomic_add_unless(&tmp->users, 1, lim); + rcu_read_unlock(); +} + +static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim) +{ + struct tipc_aead *tmp; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (tmp) + atomic_add_unless(&rcu_dereference(aead)->users, -1, lim); + rcu_read_unlock(); +} + +static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val) +{ + struct tipc_aead *tmp; + int cur; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (tmp) { + do { + cur = atomic_read(&tmp->users); + if (cur == val) + break; + } while (atomic_cmpxchg(&tmp->users, cur, val) != cur); + } + rcu_read_unlock(); +} + +/** + * tipc_aead_tfm_next - Move TFM entry to the next one in list and return it + */ +static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead) +{ + struct tipc_tfm **tfm_entry = this_cpu_ptr(aead->tfm_entry); + + *tfm_entry = list_next_entry(*tfm_entry, list); + return (*tfm_entry)->tfm; +} + +/** + * tipc_aead_init - Initiate TIPC AEAD + * @aead: returned new TIPC AEAD key handle pointer + * @ukey: pointer to user key data + * @mode: the key mode + * + * Allocate a (list of) new cipher transformation (TFM) with the specific user + * key data if valid. The number of the allocated TFMs can be set via the sysfs + * "net/tipc/max_tfms" first. + * Also, all the other AEAD data are also initialized. + * + * Return: 0 if the initiation is successful, otherwise: < 0 + */ +static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, + u8 mode) +{ + struct tipc_tfm *tfm_entry, *head; + struct crypto_aead *tfm; + struct tipc_aead *tmp; + int keylen, err, cpu; + int tfm_cnt = 0; + + if (unlikely(*aead)) + return -EEXIST; + + /* Allocate a new AEAD */ + tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); + if (unlikely(!tmp)) + return -ENOMEM; + + /* The key consists of two parts: [AES-KEY][SALT] */ + keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; + + /* Allocate per-cpu TFM entry pointer */ + tmp->tfm_entry = alloc_percpu(struct tipc_tfm *); + if (!tmp->tfm_entry) { + kzfree(tmp); + return -ENOMEM; + } + + /* Make a list of TFMs with the user key data */ + do { + tfm = crypto_alloc_aead(ukey->alg_name, 0, 0); + if (IS_ERR(tfm)) { + err = PTR_ERR(tfm); + break; + } + + if (unlikely(!tfm_cnt && + crypto_aead_ivsize(tfm) != TIPC_AES_GCM_IV_SIZE)) { + crypto_free_aead(tfm); + err = -ENOTSUPP; + break; + } + + err = crypto_aead_setauthsize(tfm, TIPC_AES_GCM_TAG_SIZE); + err |= crypto_aead_setkey(tfm, ukey->key, keylen); + if (unlikely(err)) { + crypto_free_aead(tfm); + break; + } + + tfm_entry = kmalloc(sizeof(*tfm_entry), GFP_KERNEL); + if (unlikely(!tfm_entry)) { + crypto_free_aead(tfm); + err = -ENOMEM; + break; + } + INIT_LIST_HEAD(&tfm_entry->list); + tfm_entry->tfm = tfm; + + /* First entry? */ + if (!tfm_cnt) { + head = tfm_entry; + for_each_possible_cpu(cpu) { + *per_cpu_ptr(tmp->tfm_entry, cpu) = head; + } + } else { + list_add_tail(&tfm_entry->list, &head->list); + } + + } while (++tfm_cnt < sysctl_tipc_max_tfms); + + /* Not any TFM is allocated? */ + if (!tfm_cnt) { + free_percpu(tmp->tfm_entry); + kzfree(tmp); + return err; + } + + /* Copy some chars from the user key as a hint */ + memcpy(tmp->hint, ukey->key, TIPC_AEAD_HINT_LEN); + tmp->hint[TIPC_AEAD_HINT_LEN] = '\0'; + + /* Initialize the other data */ + tmp->mode = mode; + tmp->cloned = NULL; + tmp->authsize = TIPC_AES_GCM_TAG_SIZE; + memcpy(&tmp->salt, ukey->key + keylen, TIPC_AES_GCM_SALT_SIZE); + atomic_set(&tmp->users, 0); + atomic64_set(&tmp->seqno, 0); + refcount_set(&tmp->refcnt, 1); + + *aead = tmp; + return 0; +} + +/** + * tipc_aead_clone - Clone a TIPC AEAD key + * @dst: dest key for the cloning + * @src: source key to clone from + * + * Make a "copy" of the source AEAD key data to the dest, the TFMs list is + * common for the keys. + * A reference to the source is hold in the "cloned" pointer for the later + * freeing purposes. + * + * Note: this must be done in cluster-key mode only! + * Return: 0 in case of success, otherwise < 0 + */ +static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src) +{ + struct tipc_aead *aead; + int cpu; + + if (!src) + return -ENOKEY; + + if (src->mode != CLUSTER_KEY) + return -EINVAL; + + if (unlikely(*dst)) + return -EEXIST; + + aead = kzalloc(sizeof(*aead), GFP_ATOMIC); + if (unlikely(!aead)) + return -ENOMEM; + + aead->tfm_entry = alloc_percpu_gfp(struct tipc_tfm *, GFP_ATOMIC); + if (unlikely(!aead->tfm_entry)) { + kzfree(aead); + return -ENOMEM; + } + + for_each_possible_cpu(cpu) { + *per_cpu_ptr(aead->tfm_entry, cpu) = + *per_cpu_ptr(src->tfm_entry, cpu); + } + + memcpy(aead->hint, src->hint, sizeof(src->hint)); + aead->mode = src->mode; + aead->salt = src->salt; + aead->authsize = src->authsize; + atomic_set(&aead->users, 0); + atomic64_set(&aead->seqno, 0); + refcount_set(&aead->refcnt, 1); + + WARN_ON(!refcount_inc_not_zero(&src->refcnt)); + aead->cloned = src; + + *dst = aead; + return 0; +} + +/** + * tipc_aead_mem_alloc - Allocate memory for AEAD request operations + * @tfm: cipher handle to be registered with the request + * @crypto_ctx_size: size of crypto context for callback + * @iv: returned pointer to IV data + * @req: returned pointer to AEAD request data + * @sg: returned pointer to SG lists + * @nsg: number of SG lists to be allocated + * + * Allocate memory to store the crypto context data, AEAD request, IV and SG + * lists, the memory layout is as follows: + * crypto_ctx || iv || aead_req || sg[] + * + * Return: the pointer to the memory areas in case of success, otherwise NULL + */ +static void *tipc_aead_mem_alloc(struct crypto_aead *tfm, + unsigned int crypto_ctx_size, + u8 **iv, struct aead_request **req, + struct scatterlist **sg, int nsg) +{ + unsigned int iv_size, req_size; + unsigned int len; + u8 *mem; + + iv_size = crypto_aead_ivsize(tfm); + req_size = sizeof(**req) + crypto_aead_reqsize(tfm); + + len = crypto_ctx_size; + len += iv_size; + len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1); + len = ALIGN(len, crypto_tfm_ctx_alignment()); + len += req_size; + len = ALIGN(len, __alignof__(struct scatterlist)); + len += nsg * sizeof(**sg); + + mem = kmalloc(len, GFP_ATOMIC); + if (!mem) + return NULL; + + *iv = (u8 *)PTR_ALIGN(mem + crypto_ctx_size, + crypto_aead_alignmask(tfm) + 1); + *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size, + crypto_tfm_ctx_alignment()); + *sg = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, + __alignof__(struct scatterlist)); + + return (void *)mem; +} + +/** + * tipc_aead_encrypt - Encrypt a message + * @aead: TIPC AEAD key for the message encryption + * @skb: the input/output skb + * @b: TIPC bearer where the message will be delivered after the encryption + * @dst: the destination media address + * @__dnode: TIPC dest node if "known" + * + * Return: + * 0 : if the encryption has completed + * -EINPROGRESS/-EBUSY : if a callback will be performed + * < 0 : the encryption has failed + */ +static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, + struct tipc_bearer *b, + struct tipc_media_addr *dst, + struct tipc_node *__dnode) +{ + struct crypto_aead *tfm = tipc_aead_tfm_next(aead); + struct tipc_crypto_tx_ctx *tx_ctx; + struct aead_request *req; + struct sk_buff *trailer; + struct scatterlist *sg; + struct tipc_ehdr *ehdr; + int ehsz, len, tailen, nsg, rc; + void *ctx; + u32 salt; + u8 *iv; + + /* Make sure message len at least 4-byte aligned */ + len = ALIGN(skb->len, 4); + tailen = len - skb->len + aead->authsize; + + /* Expand skb tail for authentication tag: + * As for simplicity, we'd have made sure skb having enough tailroom + * for authentication tag @skb allocation. Even when skb is nonlinear + * but there is no frag_list, it should be still fine! + * Otherwise, we must cow it to be a writable buffer with the tailroom. + */ +#ifdef TIPC_CRYPTO_DEBUG + SKB_LINEAR_ASSERT(skb); + if (tailen > skb_tailroom(skb)) { + pr_warn("TX: skb tailroom is not enough: %d, requires: %d\n", + skb_tailroom(skb), tailen); + } +#endif + + if (unlikely(!skb_cloned(skb) && tailen <= skb_tailroom(skb))) { + nsg = 1; + trailer = skb; + } else { + /* TODO: We could avoid skb_cow_data() if skb has no frag_list + * e.g. by skb_fill_page_desc() to add another page to the skb + * with the wanted tailen... However, page skbs look not often, + * so take it easy now! + * Cloned skbs e.g. from link_xmit() seems no choice though :( + */ + nsg = skb_cow_data(skb, tailen, &trailer); + if (unlikely(nsg < 0)) { + pr_err("TX: skb_cow_data() returned %d\n", nsg); + return nsg; + } + } + + pskb_put(skb, trailer, tailen); + + /* Allocate memory for the AEAD operation */ + ctx = tipc_aead_mem_alloc(tfm, sizeof(*tx_ctx), &iv, &req, &sg, nsg); + if (unlikely(!ctx)) + return -ENOMEM; + TIPC_SKB_CB(skb)->crypto_ctx = ctx; + + /* Map skb to the sg lists */ + sg_init_table(sg, nsg); + rc = skb_to_sgvec(skb, sg, 0, skb->len); + if (unlikely(rc < 0)) { + pr_err("TX: skb_to_sgvec() returned %d, nsg %d!\n", rc, nsg); + goto exit; + } + + /* Prepare IV: [SALT (4 octets)][SEQNO (8 octets)] + * In case we're in cluster-key mode, SALT is varied by xor-ing with + * the source address (or w0 of id), otherwise with the dest address + * if dest is known. + */ + ehdr = (struct tipc_ehdr *)skb->data; + salt = aead->salt; + if (aead->mode == CLUSTER_KEY) + salt ^= ehdr->addr; /* __be32 */ + else if (__dnode) + salt ^= tipc_node_get_addr(__dnode); + memcpy(iv, &salt, 4); + memcpy(iv + 4, (u8 *)&ehdr->seqno, 8); + + /* Prepare request */ + ehsz = tipc_ehdr_size(ehdr); + aead_request_set_tfm(req, tfm); + aead_request_set_ad(req, ehsz); + aead_request_set_crypt(req, sg, sg, len - ehsz, iv); + + /* Set callback function & data */ + aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + tipc_aead_encrypt_done, skb); + tx_ctx = (struct tipc_crypto_tx_ctx *)ctx; + tx_ctx->aead = aead; + tx_ctx->bearer = b; + memcpy(&tx_ctx->dst, dst, sizeof(*dst)); + + /* Hold bearer */ + if (unlikely(!tipc_bearer_hold(b))) { + rc = -ENODEV; + goto exit; + } + + /* Now, do encrypt */ + rc = crypto_aead_encrypt(req); + if (rc == -EINPROGRESS || rc == -EBUSY) + return rc; + + tipc_bearer_put(b); + +exit: + kfree(ctx); + TIPC_SKB_CB(skb)->crypto_ctx = NULL; + return rc; +} + +static void tipc_aead_encrypt_done(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + struct tipc_crypto_tx_ctx *tx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; + struct tipc_bearer *b = tx_ctx->bearer; + struct tipc_aead *aead = tx_ctx->aead; + struct tipc_crypto *tx = aead->crypto; + struct net *net = tx->net; + + switch (err) { + case 0: + this_cpu_inc(tx->stats->stat[STAT_ASYNC_OK]); + if (likely(test_bit(0, &b->up))) + b->media->send_msg(net, skb, b, &tx_ctx->dst); + else + kfree_skb(skb); + break; + case -EINPROGRESS: + return; + default: + this_cpu_inc(tx->stats->stat[STAT_ASYNC_NOK]); + kfree_skb(skb); + break; + } + + kfree(tx_ctx); + tipc_bearer_put(b); + tipc_aead_put(aead); +} + +/** + * tipc_aead_decrypt - Decrypt an encrypted message + * @net: struct net + * @aead: TIPC AEAD for the message decryption + * @skb: the input/output skb + * @b: TIPC bearer where the message has been received + * + * Return: + * 0 : if the decryption has completed + * -EINPROGRESS/-EBUSY : if a callback will be performed + * < 0 : the decryption has failed + */ +static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, + struct sk_buff *skb, struct tipc_bearer *b) +{ + struct tipc_crypto_rx_ctx *rx_ctx; + struct aead_request *req; + struct crypto_aead *tfm; + struct sk_buff *unused; + struct scatterlist *sg; + struct tipc_ehdr *ehdr; + int ehsz, nsg, rc; + void *ctx; + u32 salt; + u8 *iv; + + if (unlikely(!aead)) + return -ENOKEY; + + /* Cow skb data if needed */ + if (likely(!skb_cloned(skb) && + (!skb_is_nonlinear(skb) || !skb_has_frag_list(skb)))) { + nsg = 1 + skb_shinfo(skb)->nr_frags; + } else { + nsg = skb_cow_data(skb, 0, &unused); + if (unlikely(nsg < 0)) { + pr_err("RX: skb_cow_data() returned %d\n", nsg); + return nsg; + } + } + + /* Allocate memory for the AEAD operation */ + tfm = tipc_aead_tfm_next(aead); + ctx = tipc_aead_mem_alloc(tfm, sizeof(*rx_ctx), &iv, &req, &sg, nsg); + if (unlikely(!ctx)) + return -ENOMEM; + TIPC_SKB_CB(skb)->crypto_ctx = ctx; + + /* Map skb to the sg lists */ + sg_init_table(sg, nsg); + rc = skb_to_sgvec(skb, sg, 0, skb->len); + if (unlikely(rc < 0)) { + pr_err("RX: skb_to_sgvec() returned %d, nsg %d\n", rc, nsg); + goto exit; + } + + /* Reconstruct IV: */ + ehdr = (struct tipc_ehdr *)skb->data; + salt = aead->salt; + if (aead->mode == CLUSTER_KEY) + salt ^= ehdr->addr; /* __be32 */ + else if (ehdr->destined) + salt ^= tipc_own_addr(net); + memcpy(iv, &salt, 4); + memcpy(iv + 4, (u8 *)&ehdr->seqno, 8); + + /* Prepare request */ + ehsz = tipc_ehdr_size(ehdr); + aead_request_set_tfm(req, tfm); + aead_request_set_ad(req, ehsz); + aead_request_set_crypt(req, sg, sg, skb->len - ehsz, iv); + + /* Set callback function & data */ + aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + tipc_aead_decrypt_done, skb); + rx_ctx = (struct tipc_crypto_rx_ctx *)ctx; + rx_ctx->aead = aead; + rx_ctx->bearer = b; + + /* Hold bearer */ + if (unlikely(!tipc_bearer_hold(b))) { + rc = -ENODEV; + goto exit; + } + + /* Now, do decrypt */ + rc = crypto_aead_decrypt(req); + if (rc == -EINPROGRESS || rc == -EBUSY) + return rc; + + tipc_bearer_put(b); + +exit: + kfree(ctx); + TIPC_SKB_CB(skb)->crypto_ctx = NULL; + return rc; +} + +static void tipc_aead_decrypt_done(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + struct tipc_crypto_rx_ctx *rx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; + struct tipc_bearer *b = rx_ctx->bearer; + struct tipc_aead *aead = rx_ctx->aead; + struct tipc_crypto_stats __percpu *stats = aead->crypto->stats; + struct net *net = aead->crypto->net; + + switch (err) { + case 0: + this_cpu_inc(stats->stat[STAT_ASYNC_OK]); + break; + case -EINPROGRESS: + return; + default: + this_cpu_inc(stats->stat[STAT_ASYNC_NOK]); + break; + } + + kfree(rx_ctx); + tipc_crypto_rcv_complete(net, aead, b, &skb, err); + if (likely(skb)) { + if (likely(test_bit(0, &b->up))) + tipc_rcv(net, skb, b); + else + kfree_skb(skb); + } + + tipc_bearer_put(b); +} + +static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr) +{ + return (ehdr->user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE; +} + +/** + * tipc_ehdr_validate - Validate an encryption message + * @skb: the message buffer + * + * Returns "true" if this is a valid encryption message, otherwise "false" + */ +bool tipc_ehdr_validate(struct sk_buff *skb) +{ + struct tipc_ehdr *ehdr; + int ehsz; + + if (unlikely(!pskb_may_pull(skb, EHDR_MIN_SIZE))) + return false; + + ehdr = (struct tipc_ehdr *)skb->data; + if (unlikely(ehdr->version != TIPC_EVERSION)) + return false; + ehsz = tipc_ehdr_size(ehdr); + if (unlikely(!pskb_may_pull(skb, ehsz))) + return false; + if (unlikely(skb->len <= ehsz + TIPC_AES_GCM_TAG_SIZE)) + return false; + if (unlikely(!ehdr->tx_key)) + return false; + + return true; +} + +/** + * tipc_ehdr_build - Build TIPC encryption message header + * @net: struct net + * @aead: TX AEAD key to be used for the message encryption + * @tx_key: key id used for the message encryption + * @skb: input/output message skb + * @__rx: RX crypto handle if dest is "known" + * + * Return: the header size if the building is successful, otherwise < 0 + */ +static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, + u8 tx_key, struct sk_buff *skb, + struct tipc_crypto *__rx) +{ + struct tipc_msg *hdr = buf_msg(skb); + struct tipc_ehdr *ehdr; + u32 user = msg_user(hdr); + u64 seqno; + int ehsz; + + /* Make room for encryption header */ + ehsz = (user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE; + WARN_ON(skb_headroom(skb) < ehsz); + ehdr = (struct tipc_ehdr *)skb_push(skb, ehsz); + + /* Obtain a seqno first: + * Use the key seqno (= cluster wise) if dest is unknown or we're in + * cluster key mode, otherwise it's better for a per-peer seqno! + */ + if (!__rx || aead->mode == CLUSTER_KEY) + seqno = atomic64_inc_return(&aead->seqno); + else + seqno = atomic64_inc_return(&__rx->sndnxt); + + /* Revoke the key if seqno is wrapped around */ + if (unlikely(!seqno)) + return tipc_crypto_key_revoke(net, tx_key); + + /* Word 1-2 */ + ehdr->seqno = cpu_to_be64(seqno); + + /* Words 0, 3- */ + ehdr->version = TIPC_EVERSION; + ehdr->user = 0; + ehdr->keepalive = 0; + ehdr->tx_key = tx_key; + ehdr->destined = (__rx) ? 1 : 0; + ehdr->rx_key_active = (__rx) ? __rx->key.active : 0; + ehdr->reserved_1 = 0; + ehdr->reserved_2 = 0; + + switch (user) { + case LINK_CONFIG: + ehdr->user = LINK_CONFIG; + memcpy(ehdr->id, tipc_own_id(net), NODE_ID_LEN); + break; + default: + if (user == LINK_PROTOCOL && msg_type(hdr) == STATE_MSG) { + ehdr->user = LINK_PROTOCOL; + ehdr->keepalive = msg_is_keepalive(hdr); + } + ehdr->addr = hdr->hdr[3]; + break; + } + + return ehsz; +} + +static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, + u8 new_passive, + u8 new_active, + u8 new_pending) +{ +#ifdef TIPC_CRYPTO_DEBUG + struct tipc_key old = c->key; + char buf[32]; +#endif + + c->key.keys = ((new_passive & KEY_MASK) << (KEY_BITS * 2)) | + ((new_active & KEY_MASK) << (KEY_BITS)) | + ((new_pending & KEY_MASK)); + +#ifdef TIPC_CRYPTO_DEBUG + pr_info("%s(%s): key changing %s ::%pS\n", + (c->node) ? "RX" : "TX", + (c->node) ? tipc_node_get_id_str(c->node) : + tipc_own_id_string(c->net), + tipc_key_change_dump(old, c->key, buf), + __builtin_return_address(0)); +#endif +} + +/** + * tipc_crypto_key_init - Initiate a new user / AEAD key + * @c: TIPC crypto to which new key is attached + * @ukey: the user key + * @mode: the key mode (CLUSTER_KEY or PER_NODE_KEY) + * + * A new TIPC AEAD key will be allocated and initiated with the specified user + * key, then attached to the TIPC crypto. + * + * Return: new key id in case of success, otherwise: < 0 + */ +int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, + u8 mode) +{ + struct tipc_aead *aead = NULL; + int rc = 0; + + /* Initiate with the new user key */ + rc = tipc_aead_init(&aead, ukey, mode); + + /* Attach it to the crypto */ + if (likely(!rc)) { + rc = tipc_crypto_key_attach(c, aead, 0); + if (rc < 0) + tipc_aead_free(&aead->rcu); + } + + pr_info("%s(%s): key initiating, rc %d!\n", + (c->node) ? "RX" : "TX", + (c->node) ? tipc_node_get_id_str(c->node) : + tipc_own_id_string(c->net), + rc); + + return rc; +} + +/** + * tipc_crypto_key_attach - Attach a new AEAD key to TIPC crypto + * @c: TIPC crypto to which the new AEAD key is attached + * @aead: the new AEAD key pointer + * @pos: desired slot in the crypto key array, = 0 if any! + * + * Return: new key id in case of success, otherwise: -EBUSY + */ +static int tipc_crypto_key_attach(struct tipc_crypto *c, + struct tipc_aead *aead, u8 pos) +{ + u8 new_pending, new_passive, new_key; + struct tipc_key key; + int rc = -EBUSY; + + spin_lock_bh(&c->lock); + key = c->key; + if (key.active && key.passive) + goto exit; + if (key.passive && !tipc_aead_users(c->aead[key.passive])) + goto exit; + if (key.pending) { + if (pos) + goto exit; + if (tipc_aead_users(c->aead[key.pending]) > 0) + goto exit; + /* Replace it */ + new_pending = key.pending; + new_passive = key.passive; + new_key = new_pending; + } else { + if (pos) { + if (key.active && pos != key_next(key.active)) { + new_pending = key.pending; + new_passive = pos; + new_key = new_passive; + goto attach; + } else if (!key.active && !key.passive) { + new_pending = pos; + new_passive = key.passive; + new_key = new_pending; + goto attach; + } + } + new_pending = key_next(key.active ?: key.passive); + new_passive = key.passive; + new_key = new_pending; + } + +attach: + aead->crypto = c; + tipc_crypto_key_set_state(c, new_passive, key.active, new_pending); + tipc_aead_rcu_replace(c->aead[new_key], aead, &c->lock); + + c->working = 1; + c->timer1 = jiffies; + c->timer2 = jiffies; + rc = new_key; + +exit: + spin_unlock_bh(&c->lock); + return rc; +} + +void tipc_crypto_key_flush(struct tipc_crypto *c) +{ + int k; + + spin_lock_bh(&c->lock); + c->working = 0; + tipc_crypto_key_set_state(c, 0, 0, 0); + for (k = KEY_MIN; k <= KEY_MAX; k++) + tipc_crypto_key_detach(c->aead[k], &c->lock); + atomic_set(&c->peer_rx_active, 0); + atomic64_set(&c->sndnxt, 0); + spin_unlock_bh(&c->lock); +} + +/** + * tipc_crypto_key_try_align - Align RX keys if possible + * @rx: RX crypto handle + * @new_pending: new pending slot if aligned (= TX key from peer) + * + * Peer has used an unknown key slot, this only happens when peer has left and + * rejoned, or we are newcomer. + * That means, there must be no active key but a pending key at unaligned slot. + * If so, we try to move the pending key to the new slot. + * Note: A potential passive key can exist, it will be shifted correspondingly! + * + * Return: "true" if key is successfully aligned, otherwise "false" + */ +static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending) +{ + struct tipc_aead *tmp1, *tmp2 = NULL; + struct tipc_key key; + bool aligned = false; + u8 new_passive = 0; + int x; + + spin_lock(&rx->lock); + key = rx->key; + if (key.pending == new_pending) { + aligned = true; + goto exit; + } + if (key.active) + goto exit; + if (!key.pending) + goto exit; + if (tipc_aead_users(rx->aead[key.pending]) > 0) + goto exit; + + /* Try to "isolate" this pending key first */ + tmp1 = tipc_aead_rcu_ptr(rx->aead[key.pending], &rx->lock); + if (!refcount_dec_if_one(&tmp1->refcnt)) + goto exit; + rcu_assign_pointer(rx->aead[key.pending], NULL); + + /* Move passive key if any */ + if (key.passive) { + tmp2 = rcu_replace_pointer(rx->aead[key.passive], tmp2, lockdep_is_held(&rx->lock)); + x = (key.passive - key.pending + new_pending) % KEY_MAX; + new_passive = (x <= 0) ? x + KEY_MAX : x; + } + + /* Re-allocate the key(s) */ + tipc_crypto_key_set_state(rx, new_passive, 0, new_pending); + rcu_assign_pointer(rx->aead[new_pending], tmp1); + if (new_passive) + rcu_assign_pointer(rx->aead[new_passive], tmp2); + refcount_set(&tmp1->refcnt, 1); + aligned = true; + pr_info("RX(%s): key is aligned!\n", tipc_node_get_id_str(rx->node)); + +exit: + spin_unlock(&rx->lock); + return aligned; +} + +/** + * tipc_crypto_key_pick_tx - Pick one TX key for message decryption + * @tx: TX crypto handle + * @rx: RX crypto handle (can be NULL) + * @skb: the message skb which will be decrypted later + * + * This function looks up the existing TX keys and pick one which is suitable + * for the message decryption, that must be a cluster key and not used before + * on the same message (i.e. recursive). + * + * Return: the TX AEAD key handle in case of success, otherwise NULL + */ +static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, + struct tipc_crypto *rx, + struct sk_buff *skb) +{ + struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(skb); + struct tipc_aead *aead = NULL; + struct tipc_key key = tx->key; + u8 k, i = 0; + + /* Initialize data if not yet */ + if (!skb_cb->tx_clone_deferred) { + skb_cb->tx_clone_deferred = 1; + memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx)); + } + + skb_cb->tx_clone_ctx.rx = rx; + if (++skb_cb->tx_clone_ctx.recurs > 2) + return NULL; + + /* Pick one TX key */ + spin_lock(&tx->lock); + do { + k = (i == 0) ? key.pending : + ((i == 1) ? key.active : key.passive); + if (!k) + continue; + aead = tipc_aead_rcu_ptr(tx->aead[k], &tx->lock); + if (!aead) + continue; + if (aead->mode != CLUSTER_KEY || + aead == skb_cb->tx_clone_ctx.last) { + aead = NULL; + continue; + } + /* Ok, found one cluster key */ + skb_cb->tx_clone_ctx.last = aead; + WARN_ON(skb->next); + skb->next = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!skb->next)) + pr_warn("Failed to clone skb for next round if any\n"); + WARN_ON(!refcount_inc_not_zero(&aead->refcnt)); + break; + } while (++i < 3); + spin_unlock(&tx->lock); + + return aead; +} + +/** + * tipc_crypto_key_synch: Synch own key data according to peer key status + * @rx: RX crypto handle + * @new_rx_active: latest RX active key from peer + * @hdr: TIPCv2 message + * + * This function updates the peer node related data as the peer RX active key + * has changed, so the number of TX keys' users on this node are increased and + * decreased correspondingly. + * + * The "per-peer" sndnxt is also reset when the peer key has switched. + */ +static void tipc_crypto_key_synch(struct tipc_crypto *rx, u8 new_rx_active, + struct tipc_msg *hdr) +{ + struct net *net = rx->net; + struct tipc_crypto *tx = tipc_net(net)->crypto_tx; + u8 cur_rx_active; + + /* TX might be even not ready yet */ + if (unlikely(!tx->key.active && !tx->key.pending)) + return; + + cur_rx_active = atomic_read(&rx->peer_rx_active); + if (likely(cur_rx_active == new_rx_active)) + return; + + /* Make sure this message destined for this node */ + if (unlikely(msg_short(hdr) || + msg_destnode(hdr) != tipc_own_addr(net))) + return; + + /* Peer RX active key has changed, try to update owns' & TX users */ + if (atomic_cmpxchg(&rx->peer_rx_active, + cur_rx_active, + new_rx_active) == cur_rx_active) { + if (new_rx_active) + tipc_aead_users_inc(tx->aead[new_rx_active], INT_MAX); + if (cur_rx_active) + tipc_aead_users_dec(tx->aead[cur_rx_active], 0); + + atomic64_set(&rx->sndnxt, 0); + /* Mark the point TX key users changed */ + tx->timer1 = jiffies; + +#ifdef TIPC_CRYPTO_DEBUG + pr_info("TX(%s): key users changed %d-- %d++, peer RX(%s)\n", + tipc_own_id_string(net), cur_rx_active, + new_rx_active, tipc_node_get_id_str(rx->node)); +#endif + } +} + +static int tipc_crypto_key_revoke(struct net *net, u8 tx_key) +{ + struct tipc_crypto *tx = tipc_net(net)->crypto_tx; + struct tipc_key key; + + spin_lock(&tx->lock); + key = tx->key; + WARN_ON(!key.active || tx_key != key.active); + + /* Free the active key */ + tipc_crypto_key_set_state(tx, key.passive, 0, key.pending); + tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); + spin_unlock(&tx->lock); + + pr_warn("TX(%s): key is revoked!\n", tipc_own_id_string(net)); + return -EKEYREVOKED; +} + +int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, + struct tipc_node *node) +{ + struct tipc_crypto *c; + + if (*crypto) + return -EEXIST; + + /* Allocate crypto */ + c = kzalloc(sizeof(*c), GFP_ATOMIC); + if (!c) + return -ENOMEM; + + /* Allocate statistic structure */ + c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC); + if (!c->stats) { + kzfree(c); + return -ENOMEM; + } + + c->working = 0; + c->net = net; + c->node = node; + tipc_crypto_key_set_state(c, 0, 0, 0); + atomic_set(&c->peer_rx_active, 0); + atomic64_set(&c->sndnxt, 0); + c->timer1 = jiffies; + c->timer2 = jiffies; + spin_lock_init(&c->lock); + *crypto = c; + + return 0; +} + +void tipc_crypto_stop(struct tipc_crypto **crypto) +{ + struct tipc_crypto *c, *tx, *rx; + bool is_rx; + u8 k; + + if (!*crypto) + return; + + rcu_read_lock(); + /* RX stopping? => decrease TX key users if any */ + is_rx = !!((*crypto)->node); + if (is_rx) { + rx = *crypto; + tx = tipc_net(rx->net)->crypto_tx; + k = atomic_read(&rx->peer_rx_active); + if (k) { + tipc_aead_users_dec(tx->aead[k], 0); + /* Mark the point TX key users changed */ + tx->timer1 = jiffies; + } + } + + /* Release AEAD keys */ + c = *crypto; + for (k = KEY_MIN; k <= KEY_MAX; k++) + tipc_aead_put(rcu_dereference(c->aead[k])); + rcu_read_unlock(); + + pr_warn("%s(%s) has been purged, node left!\n", + (is_rx) ? "RX" : "TX", + (is_rx) ? tipc_node_get_id_str((*crypto)->node) : + tipc_own_id_string((*crypto)->net)); + + /* Free this crypto statistics */ + free_percpu(c->stats); + + *crypto = NULL; + kzfree(c); +} + +void tipc_crypto_timeout(struct tipc_crypto *rx) +{ + struct tipc_net *tn = tipc_net(rx->net); + struct tipc_crypto *tx = tn->crypto_tx; + struct tipc_key key; + u8 new_pending, new_passive; + int cmd; + + /* TX key activating: + * The pending key (users > 0) -> active + * The active key if any (users == 0) -> free + */ + spin_lock(&tx->lock); + key = tx->key; + if (key.active && tipc_aead_users(tx->aead[key.active]) > 0) + goto s1; + if (!key.pending || tipc_aead_users(tx->aead[key.pending]) <= 0) + goto s1; + if (time_before(jiffies, tx->timer1 + TIPC_TX_LASTING_LIM)) + goto s1; + + tipc_crypto_key_set_state(tx, key.passive, key.pending, 0); + if (key.active) + tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); + this_cpu_inc(tx->stats->stat[STAT_SWITCHES]); + pr_info("TX(%s): key %d is activated!\n", tipc_own_id_string(tx->net), + key.pending); + +s1: + spin_unlock(&tx->lock); + + /* RX key activating: + * The pending key (users > 0) -> active + * The active key if any -> passive, freed later + */ + spin_lock(&rx->lock); + key = rx->key; + if (!key.pending || tipc_aead_users(rx->aead[key.pending]) <= 0) + goto s2; + + new_pending = (key.passive && + !tipc_aead_users(rx->aead[key.passive])) ? + key.passive : 0; + new_passive = (key.active) ?: ((new_pending) ? 0 : key.passive); + tipc_crypto_key_set_state(rx, new_passive, key.pending, new_pending); + this_cpu_inc(rx->stats->stat[STAT_SWITCHES]); + pr_info("RX(%s): key %d is activated!\n", + tipc_node_get_id_str(rx->node), key.pending); + goto s5; + +s2: + /* RX key "faulty" switching: + * The faulty pending key (users < -30) -> passive + * The passive key (users = 0) -> pending + * Note: This only happens after RX deactivated - s3! + */ + key = rx->key; + if (!key.pending || tipc_aead_users(rx->aead[key.pending]) > -30) + goto s3; + if (!key.passive || tipc_aead_users(rx->aead[key.passive]) != 0) + goto s3; + + new_pending = key.passive; + new_passive = key.pending; + tipc_crypto_key_set_state(rx, new_passive, key.active, new_pending); + goto s5; + +s3: + /* RX key deactivating: + * The passive key if any -> pending + * The active key -> passive (users = 0) / pending + * The pending key if any -> passive (users = 0) + */ + key = rx->key; + if (!key.active) + goto s4; + if (time_before(jiffies, rx->timer1 + TIPC_RX_ACTIVE_LIM)) + goto s4; + + new_pending = (key.passive) ?: key.active; + new_passive = (key.passive) ? key.active : key.pending; + tipc_aead_users_set(rx->aead[new_pending], 0); + if (new_passive) + tipc_aead_users_set(rx->aead[new_passive], 0); + tipc_crypto_key_set_state(rx, new_passive, 0, new_pending); + pr_info("RX(%s): key %d is deactivated!\n", + tipc_node_get_id_str(rx->node), key.active); + goto s5; + +s4: + /* RX key passive -> freed: */ + key = rx->key; + if (!key.passive || !tipc_aead_users(rx->aead[key.passive])) + goto s5; + if (time_before(jiffies, rx->timer2 + TIPC_RX_PASSIVE_LIM)) + goto s5; + + tipc_crypto_key_set_state(rx, 0, key.active, key.pending); + tipc_crypto_key_detach(rx->aead[key.passive], &rx->lock); + pr_info("RX(%s): key %d is freed!\n", tipc_node_get_id_str(rx->node), + key.passive); + +s5: + spin_unlock(&rx->lock); + + /* Limit max_tfms & do debug commands if needed */ + if (likely(sysctl_tipc_max_tfms <= TIPC_MAX_TFMS_LIM)) + return; + + cmd = sysctl_tipc_max_tfms; + sysctl_tipc_max_tfms = TIPC_MAX_TFMS_DEF; + tipc_crypto_do_cmd(rx->net, cmd); +} + +/** + * tipc_crypto_xmit - Build & encrypt TIPC message for xmit + * @net: struct net + * @skb: input/output message skb pointer + * @b: bearer used for xmit later + * @dst: destination media address + * @__dnode: destination node for reference if any + * + * First, build an encryption message header on the top of the message, then + * encrypt the original TIPC message by using the active or pending TX key. + * If the encryption is successful, the encrypted skb is returned directly or + * via the callback. + * Otherwise, the skb is freed! + * + * Return: + * 0 : the encryption has succeeded (or no encryption) + * -EINPROGRESS/-EBUSY : the encryption is ongoing, a callback will be made + * -ENOKEK : the encryption has failed due to no key + * -EKEYREVOKED : the encryption has failed due to key revoked + * -ENOMEM : the encryption has failed due to no memory + * < 0 : the encryption has failed due to other reasons + */ +int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, + struct tipc_bearer *b, struct tipc_media_addr *dst, + struct tipc_node *__dnode) +{ + struct tipc_crypto *__rx = tipc_node_crypto_rx(__dnode); + struct tipc_crypto *tx = tipc_net(net)->crypto_tx; + struct tipc_crypto_stats __percpu *stats = tx->stats; + struct tipc_key key = tx->key; + struct tipc_aead *aead = NULL; + struct sk_buff *probe; + int rc = -ENOKEY; + u8 tx_key; + + /* No encryption? */ + if (!tx->working) + return 0; + + /* Try with the pending key if available and: + * 1) This is the only choice (i.e. no active key) or; + * 2) Peer has switched to this key (unicast only) or; + * 3) It is time to do a pending key probe; + */ + if (unlikely(key.pending)) { + tx_key = key.pending; + if (!key.active) + goto encrypt; + if (__rx && atomic_read(&__rx->peer_rx_active) == tx_key) + goto encrypt; + if (TIPC_SKB_CB(*skb)->probe) + goto encrypt; + if (!__rx && + time_after(jiffies, tx->timer2 + TIPC_TX_PROBE_LIM)) { + tx->timer2 = jiffies; + probe = skb_clone(*skb, GFP_ATOMIC); + if (probe) { + TIPC_SKB_CB(probe)->probe = 1; + tipc_crypto_xmit(net, &probe, b, dst, __dnode); + if (probe) + b->media->send_msg(net, probe, b, dst); + } + } + } + /* Else, use the active key if any */ + if (likely(key.active)) { + tx_key = key.active; + goto encrypt; + } + goto exit; + +encrypt: + aead = tipc_aead_get(tx->aead[tx_key]); + if (unlikely(!aead)) + goto exit; + rc = tipc_ehdr_build(net, aead, tx_key, *skb, __rx); + if (likely(rc > 0)) + rc = tipc_aead_encrypt(aead, *skb, b, dst, __dnode); + +exit: + switch (rc) { + case 0: + this_cpu_inc(stats->stat[STAT_OK]); + break; + case -EINPROGRESS: + case -EBUSY: + this_cpu_inc(stats->stat[STAT_ASYNC]); + *skb = NULL; + return rc; + default: + this_cpu_inc(stats->stat[STAT_NOK]); + if (rc == -ENOKEY) + this_cpu_inc(stats->stat[STAT_NOKEYS]); + else if (rc == -EKEYREVOKED) + this_cpu_inc(stats->stat[STAT_BADKEYS]); + kfree_skb(*skb); + *skb = NULL; + break; + } + + tipc_aead_put(aead); + return rc; +} + +/** + * tipc_crypto_rcv - Decrypt an encrypted TIPC message from peer + * @net: struct net + * @rx: RX crypto handle + * @skb: input/output message skb pointer + * @b: bearer where the message has been received + * + * If the decryption is successful, the decrypted skb is returned directly or + * as the callback, the encryption header and auth tag will be trimed out + * before forwarding to tipc_rcv() via the tipc_crypto_rcv_complete(). + * Otherwise, the skb will be freed! + * Note: RX key(s) can be re-aligned, or in case of no key suitable, TX + * cluster key(s) can be taken for decryption (- recursive). + * + * Return: + * 0 : the decryption has successfully completed + * -EINPROGRESS/-EBUSY : the decryption is ongoing, a callback will be made + * -ENOKEY : the decryption has failed due to no key + * -EBADMSG : the decryption has failed due to bad message + * -ENOMEM : the decryption has failed due to no memory + * < 0 : the decryption has failed due to other reasons + */ +int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, + struct sk_buff **skb, struct tipc_bearer *b) +{ + struct tipc_crypto *tx = tipc_net(net)->crypto_tx; + struct tipc_crypto_stats __percpu *stats; + struct tipc_aead *aead = NULL; + struct tipc_key key; + int rc = -ENOKEY; + u8 tx_key = 0; + + /* New peer? + * Let's try with TX key (i.e. cluster mode) & verify the skb first! + */ + if (unlikely(!rx)) + goto pick_tx; + + /* Pick RX key according to TX key, three cases are possible: + * 1) The current active key (likely) or; + * 2) The pending (new or deactivated) key (if any) or; + * 3) The passive or old active key (i.e. users > 0); + */ + tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key; + key = rx->key; + if (likely(tx_key == key.active)) + goto decrypt; + if (tx_key == key.pending) + goto decrypt; + if (tx_key == key.passive) { + rx->timer2 = jiffies; + if (tipc_aead_users(rx->aead[key.passive]) > 0) + goto decrypt; + } + + /* Unknown key, let's try to align RX key(s) */ + if (tipc_crypto_key_try_align(rx, tx_key)) + goto decrypt; + +pick_tx: + /* No key suitable? Try to pick one from TX... */ + aead = tipc_crypto_key_pick_tx(tx, rx, *skb); + if (aead) + goto decrypt; + goto exit; + +decrypt: + rcu_read_lock(); + if (!aead) + aead = tipc_aead_get(rx->aead[tx_key]); + rc = tipc_aead_decrypt(net, aead, *skb, b); + rcu_read_unlock(); + +exit: + stats = ((rx) ?: tx)->stats; + switch (rc) { + case 0: + this_cpu_inc(stats->stat[STAT_OK]); + break; + case -EINPROGRESS: + case -EBUSY: + this_cpu_inc(stats->stat[STAT_ASYNC]); + *skb = NULL; + return rc; + default: + this_cpu_inc(stats->stat[STAT_NOK]); + if (rc == -ENOKEY) { + kfree_skb(*skb); + *skb = NULL; + if (rx) + tipc_node_put(rx->node); + this_cpu_inc(stats->stat[STAT_NOKEYS]); + return rc; + } else if (rc == -EBADMSG) { + this_cpu_inc(stats->stat[STAT_BADMSGS]); + } + break; + } + + tipc_crypto_rcv_complete(net, aead, b, skb, rc); + return rc; +} + +static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, + struct tipc_bearer *b, + struct sk_buff **skb, int err) +{ + struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(*skb); + struct tipc_crypto *rx = aead->crypto; + struct tipc_aead *tmp = NULL; + struct tipc_ehdr *ehdr; + struct tipc_node *n; + u8 rx_key_active; + bool destined; + + /* Is this completed by TX? */ + if (unlikely(!rx->node)) { + rx = skb_cb->tx_clone_ctx.rx; +#ifdef TIPC_CRYPTO_DEBUG + pr_info("TX->RX(%s): err %d, aead %p, skb->next %p, flags %x\n", + (rx) ? tipc_node_get_id_str(rx->node) : "-", err, aead, + (*skb)->next, skb_cb->flags); + pr_info("skb_cb [recurs %d, last %p], tx->aead [%p %p %p]\n", + skb_cb->tx_clone_ctx.recurs, skb_cb->tx_clone_ctx.last, + aead->crypto->aead[1], aead->crypto->aead[2], + aead->crypto->aead[3]); +#endif + if (unlikely(err)) { + if (err == -EBADMSG && (*skb)->next) + tipc_rcv(net, (*skb)->next, b); + goto free_skb; + } + + if (likely((*skb)->next)) { + kfree_skb((*skb)->next); + (*skb)->next = NULL; + } + ehdr = (struct tipc_ehdr *)(*skb)->data; + if (!rx) { + WARN_ON(ehdr->user != LINK_CONFIG); + n = tipc_node_create(net, 0, ehdr->id, 0xffffu, 0, + true); + rx = tipc_node_crypto_rx(n); + if (unlikely(!rx)) + goto free_skb; + } + + /* Skip cloning this time as we had a RX pending key */ + if (rx->key.pending) + goto rcv; + if (tipc_aead_clone(&tmp, aead) < 0) + goto rcv; + if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key) < 0) { + tipc_aead_free(&tmp->rcu); + goto rcv; + } + tipc_aead_put(aead); + aead = tipc_aead_get(tmp); + } + + if (unlikely(err)) { + tipc_aead_users_dec(aead, INT_MIN); + goto free_skb; + } + + /* Set the RX key's user */ + tipc_aead_users_set(aead, 1); + +rcv: + /* Mark this point, RX works */ + rx->timer1 = jiffies; + + /* Remove ehdr & auth. tag prior to tipc_rcv() */ + ehdr = (struct tipc_ehdr *)(*skb)->data; + destined = ehdr->destined; + rx_key_active = ehdr->rx_key_active; + skb_pull(*skb, tipc_ehdr_size(ehdr)); + pskb_trim(*skb, (*skb)->len - aead->authsize); + + /* Validate TIPCv2 message */ + if (unlikely(!tipc_msg_validate(skb))) { + pr_err_ratelimited("Packet dropped after decryption!\n"); + goto free_skb; + } + + /* Update peer RX active key & TX users */ + if (destined) + tipc_crypto_key_synch(rx, rx_key_active, buf_msg(*skb)); + + /* Mark skb decrypted */ + skb_cb->decrypted = 1; + + /* Clear clone cxt if any */ + if (likely(!skb_cb->tx_clone_deferred)) + goto exit; + skb_cb->tx_clone_deferred = 0; + memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx)); + goto exit; + +free_skb: + kfree_skb(*skb); + *skb = NULL; + +exit: + tipc_aead_put(aead); + if (rx) + tipc_node_put(rx->node); +} + +static void tipc_crypto_do_cmd(struct net *net, int cmd) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_crypto *tx = tn->crypto_tx, *rx; + struct list_head *p; + unsigned int stat; + int i, j, cpu; + char buf[200]; + + /* Currently only one command is supported */ + switch (cmd) { + case 0xfff1: + goto print_stats; + default: + return; + } + +print_stats: + /* Print a header */ + pr_info("\n=============== TIPC Crypto Statistics ===============\n\n"); + + /* Print key status */ + pr_info("Key status:\n"); + pr_info("TX(%7.7s)\n%s", tipc_own_id_string(net), + tipc_crypto_key_dump(tx, buf)); + + rcu_read_lock(); + for (p = tn->node_list.next; p != &tn->node_list; p = p->next) { + rx = tipc_node_crypto_rx_by_list(p); + pr_info("RX(%7.7s)\n%s", tipc_node_get_id_str(rx->node), + tipc_crypto_key_dump(rx, buf)); + } + rcu_read_unlock(); + + /* Print crypto statistics */ + for (i = 0, j = 0; i < MAX_STATS; i++) + j += scnprintf(buf + j, 200 - j, "|%11s ", hstats[i]); + pr_info("\nCounter %s", buf); + + memset(buf, '-', 115); + buf[115] = '\0'; + pr_info("%s\n", buf); + + j = scnprintf(buf, 200, "TX(%7.7s) ", tipc_own_id_string(net)); + for_each_possible_cpu(cpu) { + for (i = 0; i < MAX_STATS; i++) { + stat = per_cpu_ptr(tx->stats, cpu)->stat[i]; + j += scnprintf(buf + j, 200 - j, "|%11d ", stat); + } + pr_info("%s", buf); + j = scnprintf(buf, 200, "%12s", " "); + } + + rcu_read_lock(); + for (p = tn->node_list.next; p != &tn->node_list; p = p->next) { + rx = tipc_node_crypto_rx_by_list(p); + j = scnprintf(buf, 200, "RX(%7.7s) ", + tipc_node_get_id_str(rx->node)); + for_each_possible_cpu(cpu) { + for (i = 0; i < MAX_STATS; i++) { + stat = per_cpu_ptr(rx->stats, cpu)->stat[i]; + j += scnprintf(buf + j, 200 - j, "|%11d ", + stat); + } + pr_info("%s", buf); + j = scnprintf(buf, 200, "%12s", " "); + } + } + rcu_read_unlock(); + + pr_info("\n======================== Done ========================\n"); +} + +static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf) +{ + struct tipc_key key = c->key; + struct tipc_aead *aead; + int k, i = 0; + char *s; + + for (k = KEY_MIN; k <= KEY_MAX; k++) { + if (k == key.passive) + s = "PAS"; + else if (k == key.active) + s = "ACT"; + else if (k == key.pending) + s = "PEN"; + else + s = "-"; + i += scnprintf(buf + i, 200 - i, "\tKey%d: %s", k, s); + + rcu_read_lock(); + aead = rcu_dereference(c->aead[k]); + if (aead) + i += scnprintf(buf + i, 200 - i, + "{\"%s...\", \"%s\"}/%d:%d", + aead->hint, + (aead->mode == CLUSTER_KEY) ? "c" : "p", + atomic_read(&aead->users), + refcount_read(&aead->refcnt)); + rcu_read_unlock(); + i += scnprintf(buf + i, 200 - i, "\n"); + } + + if (c->node) + i += scnprintf(buf + i, 200 - i, "\tPeer RX active: %d\n", + atomic_read(&c->peer_rx_active)); + + return buf; +} + +#ifdef TIPC_CRYPTO_DEBUG +static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, + char *buf) +{ + struct tipc_key *key = &old; + int k, i = 0; + char *s; + + /* Output format: "[%s %s %s] -> [%s %s %s]", max len = 32 */ +again: + i += scnprintf(buf + i, 32 - i, "["); + for (k = KEY_MIN; k <= KEY_MAX; k++) { + if (k == key->passive) + s = "pas"; + else if (k == key->active) + s = "act"; + else if (k == key->pending) + s = "pen"; + else + s = "-"; + i += scnprintf(buf + i, 32 - i, + (k != KEY_MAX) ? "%s " : "%s", s); + } + if (key != &new) { + i += scnprintf(buf + i, 32 - i, "] -> "); + key = &new; + goto again; + } + i += scnprintf(buf + i, 32 - i, "]"); + return buf; +} +#endif diff --git a/net/tipc/crypto.h b/net/tipc/crypto.h new file mode 100644 index 000000000000..c3de769f49e8 --- /dev/null +++ b/net/tipc/crypto.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/** + * net/tipc/crypto.h: Include file for TIPC crypto + * + * Copyright (c) 2019, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifdef CONFIG_TIPC_CRYPTO +#ifndef _TIPC_CRYPTO_H +#define _TIPC_CRYPTO_H + +#include "core.h" +#include "node.h" +#include "msg.h" +#include "bearer.h" + +#define TIPC_EVERSION 7 + +/* AEAD aes(gcm) */ +#define TIPC_AES_GCM_KEY_SIZE_128 16 +#define TIPC_AES_GCM_KEY_SIZE_192 24 +#define TIPC_AES_GCM_KEY_SIZE_256 32 + +#define TIPC_AES_GCM_SALT_SIZE 4 +#define TIPC_AES_GCM_IV_SIZE 12 +#define TIPC_AES_GCM_TAG_SIZE 16 + +/** + * TIPC crypto modes: + * - CLUSTER_KEY: + * One single key is used for both TX & RX in all nodes in the cluster. + * - PER_NODE_KEY: + * Each nodes in the cluster has one TX key, for RX a node needs to know + * its peers' TX key for the decryption of messages from those nodes. + */ +enum { + CLUSTER_KEY = 1, + PER_NODE_KEY = (1 << 1), +}; + +extern int sysctl_tipc_max_tfms __read_mostly; + +/** + * TIPC encryption message format: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 + * 1 0 9 8 7 6 5 4|3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8|7 6 5 4 3 2 1 0 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * w0:|Ver=7| User |D|TX |RX |K| Rsvd | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * w1:| Seqno | + * w2:| (8 octets) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * w3:\ Prevnode \ + * / (4 or 16 octets) / + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * \ \ + * / Encrypted complete TIPC V2 header and user data / + * \ \ + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | | + * | AuthTag | + * | (16 octets) | + * | | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Word0: + * Ver : = 7 i.e. TIPC encryption message version + * User : = 7 (for LINK_PROTOCOL); = 13 (for LINK_CONFIG) or = 0 + * D : The destined bit i.e. the message's destination node is + * "known" or not at the message encryption + * TX : TX key used for the message encryption + * RX : Currently RX active key corresponding to the destination + * node's TX key (when the "D" bit is set) + * K : Keep-alive bit (for RPS, LINK_PROTOCOL/STATE_MSG only) + * Rsvd : Reserved bit, field + * Word1-2: + * Seqno : The 64-bit sequence number of the encrypted message, also + * part of the nonce used for the message encryption/decryption + * Word3-: + * Prevnode: The source node address, or ID in case LINK_CONFIG only + * AuthTag : The authentication tag for the message integrity checking + * generated by the message encryption + */ +struct tipc_ehdr { + union { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 destined:1, + user:4, + version:3; + __u8 reserved_1:3, + keepalive:1, + rx_key_active:2, + tx_key:2; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 version:3, + user:4, + destined:1; + __u8 tx_key:2, + rx_key_active:2, + keepalive:1, + reserved_1:3; +#else +#error "Please fix <asm/byteorder.h>" +#endif + __be16 reserved_2; + } __packed; + __be32 w0; + }; + __be64 seqno; + union { + __be32 addr; + __u8 id[NODE_ID_LEN]; /* For a LINK_CONFIG message only! */ + }; +#define EHDR_SIZE (offsetof(struct tipc_ehdr, addr) + sizeof(__be32)) +#define EHDR_CFG_SIZE (sizeof(struct tipc_ehdr)) +#define EHDR_MIN_SIZE (EHDR_SIZE) +#define EHDR_MAX_SIZE (EHDR_CFG_SIZE) +#define EMSG_OVERHEAD (EHDR_SIZE + TIPC_AES_GCM_TAG_SIZE) +} __packed; + +int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, + struct tipc_node *node); +void tipc_crypto_stop(struct tipc_crypto **crypto); +void tipc_crypto_timeout(struct tipc_crypto *rx); +int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, + struct tipc_bearer *b, struct tipc_media_addr *dst, + struct tipc_node *__dnode); +int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, + struct sk_buff **skb, struct tipc_bearer *b); +int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, + u8 mode); +void tipc_crypto_key_flush(struct tipc_crypto *c); +int tipc_aead_key_validate(struct tipc_aead_key *ukey); +bool tipc_ehdr_validate(struct sk_buff *skb); + +#endif /* _TIPC_CRYPTO_H */ +#endif diff --git a/net/tipc/discover.c b/net/tipc/discover.c index c138d68e8a69..bfe43da127c0 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -94,6 +94,7 @@ static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb, msg_set_dest_domain(hdr, dest_domain); msg_set_bc_netid(hdr, tn->net_id); b->media->addr2msg(msg_media_addr(hdr), &b->addr); + msg_set_peer_net_hash(hdr, tipc_net_hash_mixes(net, tn->random)); msg_set_node_id(hdr, tipc_own_id(net)); } @@ -193,6 +194,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, { struct tipc_net *tn = tipc_net(net); struct tipc_msg *hdr = buf_msg(skb); + u32 pnet_hash = msg_peer_net_hash(hdr); u16 caps = msg_node_capabilities(hdr); bool legacy = tn->legacy_addr_format; u32 sugg = msg_sugg_node_addr(hdr); @@ -241,7 +243,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, return; if (!tipc_in_scope(legacy, b->domain, src)) return; - tipc_node_check_dest(net, src, peer_id, b, caps, signature, + tipc_node_check_dest(net, src, peer_id, b, caps, signature, pnet_hash, &maddr, &respond, &dupl_addr); if (dupl_addr) disc_dupl_alert(b, src, &maddr); diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index f69a2fde9f4a..8b0bb600602d 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -92,7 +92,8 @@ struct tipc_media eth_media_info = { .raw2addr = tipc_eth_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_DEF_LINK_WIN, + .max_win = TIPC_MAX_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_ETH, .hwaddr_len = ETH_ALEN, .name = "eth" diff --git a/net/tipc/group.c b/net/tipc/group.c index 5f98d38bcf08..89257e2a980d 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -199,7 +199,7 @@ void tipc_group_join(struct net *net, struct tipc_group *grp, int *sk_rcvbuf) struct tipc_member *m, *tmp; struct sk_buff_head xmitq; - skb_queue_head_init(&xmitq); + __skb_queue_head_init(&xmitq); rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, &xmitq); tipc_group_update_member(m, 0); @@ -435,7 +435,7 @@ bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, return true; if (state == MBR_PENDING && adv == ADV_IDLE) return true; - skb_queue_head_init(&xmitq); + __skb_queue_head_init(&xmitq); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, &xmitq); tipc_node_distr_xmit(grp->net, &xmitq); return true; diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c index e8c16718e3fa..7aa9ff88458d 100644 --- a/net/tipc/ib_media.c +++ b/net/tipc/ib_media.c @@ -42,6 +42,8 @@ #include "core.h" #include "bearer.h" +#define TIPC_MAX_IB_LINK_WIN 500 + /* convert InfiniBand address (media address format) media address to string */ static int tipc_ib_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) @@ -94,7 +96,8 @@ struct tipc_media ib_media_info = { .raw2addr = tipc_ib_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_DEF_LINK_WIN, + .max_win = TIPC_MAX_IB_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_IB, .hwaddr_len = INFINIBAND_ALEN, .name = "ib" diff --git a/net/tipc/link.c b/net/tipc/link.c index c2c5c53cad22..467c53a1fb5c 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -44,6 +44,7 @@ #include "netlink.h" #include "monitor.h" #include "trace.h" +#include "crypto.h" #include <linux/pkt_sched.h> @@ -160,9 +161,9 @@ struct tipc_link { struct { u16 len; u16 limit; + struct sk_buff *target_bskb; } backlog[5]; u16 snd_nxt; - u16 window; /* Reception */ u16 rcv_nxt; @@ -173,9 +174,16 @@ struct tipc_link { /* Congestion handling */ struct sk_buff_head wakeupq; + u16 window; + u16 min_win; + u16 ssthresh; + u16 max_win; + u16 cong_acks; + u16 checkpoint; /* Fragmentation/reassembly */ struct sk_buff *reasm_buf; + struct sk_buff *reasm_tnlmsg; /* Broadcast */ u16 ackers; @@ -241,12 +249,13 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, struct sk_buff_head *xmitq); static void tipc_link_build_bc_init_msg(struct tipc_link *l, struct sk_buff_head *xmitq); -static bool tipc_link_release_pkts(struct tipc_link *l, u16 to); -static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data); +static int tipc_link_release_pkts(struct tipc_link *l, u16 to); +static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data, u16 gap); static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, struct sk_buff_head *xmitq); - +static void tipc_link_update_cwin(struct tipc_link *l, int released, + bool retransmitted); /* * Simple non-static link routines (i.e. referenced outside this file) */ @@ -305,9 +314,14 @@ u32 tipc_link_id(struct tipc_link *l) return l->peer_bearer_id << 16 | l->bearer_id; } -int tipc_link_window(struct tipc_link *l) +int tipc_link_min_win(struct tipc_link *l) +{ + return l->min_win; +} + +int tipc_link_max_win(struct tipc_link *l) { - return l->window; + return l->max_win; } int tipc_link_prio(struct tipc_link *l) @@ -395,6 +409,15 @@ int tipc_link_mtu(struct tipc_link *l) return l->mtu; } +int tipc_link_mss(struct tipc_link *l) +{ +#ifdef CONFIG_TIPC_CRYPTO + return l->mtu - INT_H_SIZE - EMSG_OVERHEAD; +#else + return l->mtu - INT_H_SIZE; +#endif +} + u16 tipc_link_rcv_nxt(struct tipc_link *l) { return l->rcv_nxt; @@ -424,7 +447,8 @@ u32 tipc_link_state(struct tipc_link *l) * @net_plane: network plane (A,B,c..) this link belongs to * @mtu: mtu to be advertised by link * @priority: priority to be used by link - * @window: send window to be used by link + * @min_win: minimal send window to be used by link + * @max_win: maximal send window to be used by link * @session: session to be used by link * @ownnode: identity of own node * @peer: node id of peer node @@ -439,7 +463,7 @@ u32 tipc_link_state(struct tipc_link *l) */ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, - int window, u32 session, u32 self, + u32 min_win, u32 max_win, u32 session, u32 self, u32 peer, u8 *peer_id, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, @@ -483,7 +507,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, l->advertised_mtu = mtu; l->mtu = mtu; l->priority = priority; - tipc_link_set_queue_limits(l, window); + tipc_link_set_queue_limits(l, min_win, max_win); l->ackers = 1; l->bc_sndlink = bc_sndlink; l->bc_rcvlink = bc_rcvlink; @@ -511,7 +535,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, * Returns true if link was created, otherwise false */ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, - int mtu, int window, u16 peer_caps, + int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link *bc_sndlink, @@ -519,9 +543,9 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, { struct tipc_link *l; - if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, window, - 0, ownnode, peer, NULL, peer_caps, bc_sndlink, - NULL, inputq, namedq, link)) + if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, min_win, + max_win, 0, ownnode, peer, NULL, peer_caps, + bc_sndlink, NULL, inputq, namedq, link)) return false; l = *link; @@ -538,7 +562,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, /* Disable replicast if even a single peer doesn't support it */ if (link_is_bc_rcvlink(l) && !(peer_caps & TIPC_BCAST_RCAST)) - tipc_bcast_disable_rcast(net); + tipc_bcast_toggle_rcast(net, false); return true; } @@ -760,6 +784,8 @@ bool tipc_link_too_silent(struct tipc_link *l) return (l->silent_intv_cnt + 2 > l->abort_limit); } +static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, + u16 from, u16 to, struct sk_buff_head *xmitq); /* tipc_link_timeout - perform periodic task as instructed from node timeout */ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) @@ -792,6 +818,11 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) probe |= l->silent_intv_cnt; if (probe || mstate->monitoring) l->silent_intv_cnt++; + if (l->snd_nxt == l->checkpoint) { + tipc_link_update_cwin(l, 0, 0); + probe = true; + } + l->checkpoint = l->snd_nxt; break; case LINK_RESET: setup = l->rst_cnt++ <= 4; @@ -849,23 +880,37 @@ static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr) */ static void link_prepare_wakeup(struct tipc_link *l) { + struct sk_buff_head *wakeupq = &l->wakeupq; + struct sk_buff_head *inputq = l->inputq; struct sk_buff *skb, *tmp; - int imp, i = 0; + struct sk_buff_head tmpq; + int avail[5] = {0,}; + int imp = 0; - skb_queue_walk_safe(&l->wakeupq, skb, tmp) { + __skb_queue_head_init(&tmpq); + + for (; imp <= TIPC_SYSTEM_IMPORTANCE; imp++) + avail[imp] = l->backlog[imp].limit - l->backlog[imp].len; + + skb_queue_walk_safe(wakeupq, skb, tmp) { imp = TIPC_SKB_CB(skb)->chain_imp; - if (l->backlog[imp].len < l->backlog[imp].limit) { - skb_unlink(skb, &l->wakeupq); - skb_queue_tail(l->inputq, skb); - } else if (i++ > 10) { - break; - } + if (avail[imp] <= 0) + continue; + avail[imp]--; + __skb_unlink(skb, wakeupq); + __skb_queue_tail(&tmpq, skb); } + + spin_lock_bh(&inputq->lock); + skb_queue_splice_tail(&tmpq, inputq); + spin_unlock_bh(&inputq->lock); + } void tipc_link_reset(struct tipc_link *l) { struct sk_buff_head list; + u32 imp; __skb_queue_head_init(&list); @@ -887,14 +932,15 @@ void tipc_link_reset(struct tipc_link *l) __skb_queue_purge(&l->deferdq); __skb_queue_purge(&l->backlogq); __skb_queue_purge(&l->failover_deferdq); - l->backlog[TIPC_LOW_IMPORTANCE].len = 0; - l->backlog[TIPC_MEDIUM_IMPORTANCE].len = 0; - l->backlog[TIPC_HIGH_IMPORTANCE].len = 0; - l->backlog[TIPC_CRITICAL_IMPORTANCE].len = 0; - l->backlog[TIPC_SYSTEM_IMPORTANCE].len = 0; + for (imp = 0; imp <= TIPC_SYSTEM_IMPORTANCE; imp++) { + l->backlog[imp].len = 0; + l->backlog[imp].target_bskb = NULL; + } kfree_skb(l->reasm_buf); + kfree_skb(l->reasm_tnlmsg); kfree_skb(l->failover_reasm_skb); l->reasm_buf = NULL; + l->reasm_tnlmsg = NULL; l->failover_reasm_skb = NULL; l->rcv_unacked = 0; l->snd_nxt = 1; @@ -923,20 +969,25 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb_peek(list)); - unsigned int maxwin = l->window; - int imp = msg_importance(hdr); - unsigned int mtu = l->mtu; + struct sk_buff_head *backlogq = &l->backlogq; + struct sk_buff_head *transmq = &l->transmq; + struct sk_buff *skb, *_skb; + u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u16 ack = l->rcv_nxt - 1; u16 seqno = l->snd_nxt; - u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; - struct sk_buff_head *transmq = &l->transmq; - struct sk_buff_head *backlogq = &l->backlogq; - struct sk_buff *skb, *_skb, *bskb; int pkt_cnt = skb_queue_len(list); + int imp = msg_importance(hdr); + unsigned int mss = tipc_link_mss(l); + unsigned int cwin = l->window; + unsigned int mtu = l->mtu; + bool new_bundle; int rc = 0; if (unlikely(msg_size(hdr) > mtu)) { - skb_queue_purge(list); + pr_warn("Too large msg, purging xmit list %d %d %d %d %d!\n", + skb_queue_len(list), msg_user(hdr), + msg_type(hdr), msg_size(hdr), mtu); + __skb_queue_purge(list); return -EMSGSIZE; } @@ -955,20 +1006,18 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, } /* Prepare each packet for sending, and add to relevant queue: */ - while (skb_queue_len(list)) { - skb = skb_peek(list); - hdr = buf_msg(skb); - msg_set_seqno(hdr, seqno); - msg_set_ack(hdr, ack); - msg_set_bcast_ack(hdr, bc_ack); - - if (likely(skb_queue_len(transmq) < maxwin)) { + while ((skb = __skb_dequeue(list))) { + if (likely(skb_queue_len(transmq) < cwin)) { + hdr = buf_msg(skb); + msg_set_seqno(hdr, seqno); + msg_set_ack(hdr, ack); + msg_set_bcast_ack(hdr, bc_ack); _skb = skb_clone(skb, GFP_ATOMIC); if (!_skb) { - skb_queue_purge(list); + kfree_skb(skb); + __skb_queue_purge(list); return -ENOBUFS; } - __skb_dequeue(list); __skb_queue_tail(transmq, skb); /* next retransmit attempt */ if (link_is_bc_sndlink(l)) @@ -980,36 +1029,86 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, seqno++; continue; } - if (tipc_msg_bundle(skb_peek_tail(backlogq), hdr, mtu)) { - kfree_skb(__skb_dequeue(list)); - l->stats.sent_bundled++; - continue; - } - if (tipc_msg_make_bundle(&bskb, hdr, mtu, l->addr)) { - kfree_skb(__skb_dequeue(list)); - __skb_queue_tail(backlogq, bskb); - l->backlog[msg_importance(buf_msg(bskb))].len++; - l->stats.sent_bundled++; - l->stats.sent_bundles++; + if (tipc_msg_try_bundle(l->backlog[imp].target_bskb, &skb, + mss, l->addr, &new_bundle)) { + if (skb) { + /* Keep a ref. to the skb for next try */ + l->backlog[imp].target_bskb = skb; + l->backlog[imp].len++; + __skb_queue_tail(backlogq, skb); + } else { + if (new_bundle) { + l->stats.sent_bundles++; + l->stats.sent_bundled++; + } + l->stats.sent_bundled++; + } continue; } - l->backlog[imp].len += skb_queue_len(list); + l->backlog[imp].target_bskb = NULL; + l->backlog[imp].len += (1 + skb_queue_len(list)); + __skb_queue_tail(backlogq, skb); skb_queue_splice_tail_init(list, backlogq); } l->snd_nxt = seqno; return rc; } +static void tipc_link_update_cwin(struct tipc_link *l, int released, + bool retransmitted) +{ + int bklog_len = skb_queue_len(&l->backlogq); + struct sk_buff_head *txq = &l->transmq; + int txq_len = skb_queue_len(txq); + u16 cwin = l->window; + + /* Enter fast recovery */ + if (unlikely(retransmitted)) { + l->ssthresh = max_t(u16, l->window / 2, 300); + l->window = l->ssthresh; + return; + } + /* Enter slow start */ + if (unlikely(!released)) { + l->ssthresh = max_t(u16, l->window / 2, 300); + l->window = l->min_win; + return; + } + /* Don't increase window if no pressure on the transmit queue */ + if (txq_len + bklog_len < cwin) + return; + + /* Don't increase window if there are holes the transmit queue */ + if (txq_len && l->snd_nxt - buf_seqno(skb_peek(txq)) != txq_len) + return; + + l->cong_acks += released; + + /* Slow start */ + if (cwin <= l->ssthresh) { + l->window = min_t(u16, cwin + released, l->max_win); + return; + } + /* Congestion avoidance */ + if (l->cong_acks < cwin) + return; + l->window = min_t(u16, ++cwin, l->max_win); + l->cong_acks = 0; +} + static void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) { + u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + struct sk_buff_head *txq = &l->transmq; struct sk_buff *skb, *_skb; - struct tipc_msg *hdr; - u16 seqno = l->snd_nxt; u16 ack = l->rcv_nxt - 1; - u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + u16 seqno = l->snd_nxt; + struct tipc_msg *hdr; + u16 cwin = l->window; + u32 imp; - while (skb_queue_len(&l->transmq) < l->window) { + while (skb_queue_len(txq) < cwin) { skb = skb_peek(&l->backlogq); if (!skb) break; @@ -1018,7 +1117,10 @@ static void tipc_link_advance_backlog(struct tipc_link *l, break; __skb_dequeue(&l->backlogq); hdr = buf_msg(skb); - l->backlog[msg_importance(hdr)].len--; + imp = msg_importance(hdr); + l->backlog[imp].len--; + if (unlikely(skb == l->backlog[imp].target_bskb)) + l->backlog[imp].target_bskb = NULL; __skb_queue_tail(&l->transmq, skb); /* next retransmit attempt */ if (link_is_bc_sndlink(l)) @@ -1058,7 +1160,7 @@ static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r, return false; if (!time_after(jiffies, TIPC_SKB_CB(skb)->retr_stamp + - msecs_to_jiffies(r->tolerance))) + msecs_to_jiffies(r->tolerance * 10))) return false; hdr = buf_msg(skb); @@ -1102,6 +1204,7 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, struct sk_buff *_skb, *skb = skb_peek(&l->transmq); u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u16 ack = l->rcv_nxt - 1; + int retransmitted = 0; struct tipc_msg *hdr; int rc = 0; @@ -1121,11 +1224,10 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, continue; if (more(msg_seqno(hdr), to)) break; - if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; - _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, GFP_ATOMIC); + _skb = pskb_copy(skb, GFP_ATOMIC); if (!_skb) return 0; hdr = buf_msg(_skb); @@ -1134,11 +1236,12 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; - + retransmitted++; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) TIPC_SKB_CB(skb)->retr_stamp = jiffies; } + tipc_link_update_cwin(l, 0, retransmitted); return 0; } @@ -1238,6 +1341,7 @@ static int tipc_link_tnl_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *inputq) { struct sk_buff **reasm_skb = &l->failover_reasm_skb; + struct sk_buff **reasm_tnlmsg = &l->reasm_tnlmsg; struct sk_buff_head *fdefq = &l->failover_deferdq; struct tipc_msg *hdr = buf_msg(skb); struct sk_buff *iskb; @@ -1245,46 +1349,62 @@ static int tipc_link_tnl_rcv(struct tipc_link *l, struct sk_buff *skb, int rc = 0; u16 seqno; - /* SYNCH_MSG */ - if (msg_type(hdr) == SYNCH_MSG) - goto drop; + if (msg_type(hdr) == SYNCH_MSG) { + kfree_skb(skb); + return 0; + } - /* FAILOVER_MSG */ - if (!tipc_msg_extract(skb, &iskb, &ipos)) { - pr_warn_ratelimited("Cannot extract FAILOVER_MSG, defq: %d\n", - skb_queue_len(fdefq)); - return rc; + /* Not a fragment? */ + if (likely(!msg_nof_fragms(hdr))) { + if (unlikely(!tipc_msg_extract(skb, &iskb, &ipos))) { + pr_warn_ratelimited("Unable to extract msg, defq: %d\n", + skb_queue_len(fdefq)); + return 0; + } + kfree_skb(skb); + } else { + /* Set fragment type for buf_append */ + if (msg_fragm_no(hdr) == 1) + msg_set_type(hdr, FIRST_FRAGMENT); + else if (msg_fragm_no(hdr) < msg_nof_fragms(hdr)) + msg_set_type(hdr, FRAGMENT); + else + msg_set_type(hdr, LAST_FRAGMENT); + + if (!tipc_buf_append(reasm_tnlmsg, &skb)) { + /* Successful but non-complete reassembly? */ + if (*reasm_tnlmsg || link_is_bc_rcvlink(l)) + return 0; + pr_warn_ratelimited("Unable to reassemble tunnel msg\n"); + return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); + } + iskb = skb; } do { seqno = buf_seqno(iskb); - if (unlikely(less(seqno, l->drop_point))) { kfree_skb(iskb); continue; } - if (unlikely(seqno != l->drop_point)) { __tipc_skb_queue_sorted(fdefq, seqno, iskb); continue; } l->drop_point++; - if (!tipc_data_input(l, iskb, inputq)) rc |= tipc_link_input(l, iskb, inputq, reasm_skb); if (unlikely(rc)) break; } while ((iskb = __tipc_skb_dequeue(fdefq, l->drop_point))); -drop: - kfree_skb(skb); return rc; } -static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked) +static int tipc_link_release_pkts(struct tipc_link *l, u16 acked) { - bool released = false; + int released = 0; struct sk_buff *skb, *tmp; skb_queue_walk_safe(&l->transmq, skb, tmp) { @@ -1292,7 +1412,7 @@ static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked) break; __skb_unlink(skb, &l->transmq); kfree_skb(skb); - released = true; + released++; } return released; } @@ -1303,14 +1423,14 @@ static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked) * * returns the actual allocated memory size */ -static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data) +static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data, u16 gap) { struct sk_buff *skb = skb_peek(&l->deferdq); struct tipc_gap_ack_blks *ga = data; u16 len, expect, seqno = 0; u8 n = 0; - if (!skb) + if (!skb || !gap) goto exit; expect = buf_seqno(skb); @@ -1361,8 +1481,10 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, struct sk_buff *skb, *_skb, *tmp; struct tipc_msg *hdr; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + bool retransmitted = false; u16 ack = l->rcv_nxt - 1; bool passed = false; + u16 released = 0; u16 seqno, n = 0; int rc = 0; @@ -1374,6 +1496,7 @@ next_gap_ack: /* release skb */ __skb_unlink(skb, &l->transmq); kfree_skb(skb); + released++; } else if (less_eq(seqno, acked + gap)) { /* First, check if repeated retrans failures occurs? */ if (!passed && link_retransmit_failure(l, l, &rc)) @@ -1384,8 +1507,7 @@ next_gap_ack: if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; TIPC_SKB_CB(skb)->nxt_retr = TIPC_UC_RETR_TIME; - _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, - GFP_ATOMIC); + _skb = pskb_copy(skb, GFP_ATOMIC); if (!_skb) continue; hdr = buf_msg(_skb); @@ -1394,7 +1516,7 @@ next_gap_ack: _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; - + retransmitted = true; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) TIPC_SKB_CB(skb)->retr_stamp = jiffies; @@ -1408,7 +1530,10 @@ next_gap_ack: goto next_gap_ack; } } - + if (released || retransmitted) + tipc_link_update_cwin(l, released, retransmitted); + if (released) + tipc_link_advance_backlog(l, xmitq); return 0; } @@ -1432,7 +1557,6 @@ int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq) l->snd_nxt = l->rcv_nxt; return TIPC_LINK_SND_STATE; } - /* Unicast ACK */ l->rcv_unacked = 0; l->stats.sent_acks++; @@ -1466,7 +1590,8 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, struct sk_buff_head *xmitq) { u32 def_cnt = ++l->stats.deferred_recv; - u32 defq_len = skb_queue_len(&l->deferdq); + struct sk_buff_head *dfq = &l->deferdq; + u32 defq_len = skb_queue_len(dfq); int match1, match2; if (link_is_bc_rcvlink(l)) { @@ -1477,8 +1602,12 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, return 0; } - if (defq_len >= 3 && !((defq_len - 3) % 16)) - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq); + if (defq_len >= 3 && !((defq_len - 3) % 16)) { + u16 rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; + + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, + rcvgap, 0, 0, xmitq); + } return 0; } @@ -1493,6 +1622,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *defq = &l->deferdq; struct tipc_msg *hdr = buf_msg(skb); u16 seqno, rcv_nxt, win_lim; + int released = 0; int rc = 0; /* Verify and update link state */ @@ -1511,21 +1641,17 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, if (unlikely(!link_is_up(l))) { if (l->state == LINK_ESTABLISHING) rc = TIPC_LINK_UP_EVT; - goto drop; + kfree_skb(skb); + break; } /* Drop if outside receive window */ if (unlikely(less(seqno, rcv_nxt) || more(seqno, win_lim))) { l->stats.duplicates++; - goto drop; - } - - /* Forward queues and wake up waiting users */ - if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) { - tipc_link_advance_backlog(l, xmitq); - if (unlikely(!skb_queue_empty(&l->wakeupq))) - link_prepare_wakeup(l); + kfree_skb(skb); + break; } + released += tipc_link_release_pkts(l, msg_ack(hdr)); /* Defer delivery if sequence gap */ if (unlikely(seqno != rcv_nxt)) { @@ -1548,9 +1674,13 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, break; } while ((skb = __tipc_skb_dequeue(defq, l->rcv_nxt))); - return rc; -drop: - kfree_skb(skb); + /* Forward queues and wake up waiting users */ + if (released) { + tipc_link_update_cwin(l, released, 0); + tipc_link_advance_backlog(l, xmitq); + if (unlikely(!skb_queue_empty(&l->wakeupq))) + link_prepare_wakeup(l); + } return rc; } @@ -1576,7 +1706,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, if (!tipc_link_is_up(l) && (mtyp == STATE_MSG)) return; - if (!skb_queue_empty(dfq)) + if ((probe || probe_reply) && !skb_queue_empty(dfq)) rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE, @@ -1609,7 +1739,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, msg_set_probe(hdr, probe); msg_set_is_keepalive(hdr, probe || probe_reply); if (l->peer_caps & TIPC_GAP_ACK_BLOCK) - glen = tipc_build_gap_ack_blks(l, data); + glen = tipc_build_gap_ack_blks(l, data, rcvgap); tipc_mon_prep(l->net, data + glen, &dlen, mstate, l->bearer_id); msg_set_size(hdr, INT_H_SIZE + glen + dlen); skb_trim(skb, INT_H_SIZE + glen + dlen); @@ -1644,7 +1774,7 @@ void tipc_link_create_dummy_tnl_msg(struct tipc_link *l, struct sk_buff *skb; u32 dnode = l->addr; - skb_queue_head_init(&tnlq); + __skb_queue_head_init(&tnlq); skb = tipc_msg_create(TUNNEL_PROTOCOL, FAILOVER_MSG, INT_H_SIZE, BASIC_H_SIZE, dnode, onode, 0, 0, 0); @@ -1675,15 +1805,43 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, struct sk_buff *skb, *tnlskb; struct tipc_msg *hdr, tnlhdr; struct sk_buff_head *queue = &l->transmq; - struct sk_buff_head tmpxq, tnlq; + struct sk_buff_head tmpxq, tnlq, frags; u16 pktlen, pktcnt, seqno = l->snd_nxt; + bool pktcnt_need_update = false; + u16 syncpt; + int rc; if (!tnl) return; - skb_queue_head_init(&tnlq); - skb_queue_head_init(&tmpxq); + __skb_queue_head_init(&tnlq); + /* Link Synching: + * From now on, send only one single ("dummy") SYNCH message + * to peer. The SYNCH message does not contain any data, just + * a header conveying the synch point to the peer. + */ + if (mtyp == SYNCH_MSG && (tnl->peer_caps & TIPC_TUNNEL_ENHANCED)) { + tnlskb = tipc_msg_create(TUNNEL_PROTOCOL, SYNCH_MSG, + INT_H_SIZE, 0, l->addr, + tipc_own_addr(l->net), + 0, 0, 0); + if (!tnlskb) { + pr_warn("%sunable to create dummy SYNCH_MSG\n", + link_co_err); + return; + } + + hdr = buf_msg(tnlskb); + syncpt = l->snd_nxt + skb_queue_len(&l->backlogq) - 1; + msg_set_syncpt(hdr, syncpt); + msg_set_bearer_id(hdr, l->peer_bearer_id); + __skb_queue_tail(&tnlq, tnlskb); + tipc_link_xmit(tnl, &tnlq, xmitq); + return; + } + __skb_queue_head_init(&tmpxq); + __skb_queue_head_init(&frags); /* At least one packet required for safe algorithm => add dummy */ skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, BASIC_H_SIZE, 0, l->addr, tipc_own_addr(l->net), @@ -1692,7 +1850,7 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, pr_warn("%sunable to create tunnel packet\n", link_co_err); return; } - skb_queue_tail(&tnlq, skb); + __skb_queue_tail(&tnlq, skb); tipc_link_xmit(l, &tnlq, &tmpxq); __skb_queue_purge(&tmpxq); @@ -1713,6 +1871,39 @@ tnl: if (queue == &l->backlogq) msg_set_seqno(hdr, seqno++); pktlen = msg_size(hdr); + + /* Tunnel link MTU is not large enough? This could be + * due to: + * 1) Link MTU has just changed or set differently; + * 2) Or FAILOVER on the top of a SYNCH message + * + * The 2nd case should not happen if peer supports + * TIPC_TUNNEL_ENHANCED + */ + if (pktlen > tnl->mtu - INT_H_SIZE) { + if (mtyp == FAILOVER_MSG && + (tnl->peer_caps & TIPC_TUNNEL_ENHANCED)) { + rc = tipc_msg_fragment(skb, &tnlhdr, tnl->mtu, + &frags); + if (rc) { + pr_warn("%sunable to frag msg: rc %d\n", + link_co_err, rc); + return; + } + pktcnt += skb_queue_len(&frags) - 1; + pktcnt_need_update = true; + skb_queue_splice_tail_init(&frags, &tnlq); + continue; + } + /* Unluckily, peer doesn't have TIPC_TUNNEL_ENHANCED + * => Just warn it and return! + */ + pr_warn_ratelimited("%stoo large msg <%d, %d>: %d!\n", + link_co_err, msg_user(hdr), + msg_type(hdr), msg_size(hdr)); + return; + } + msg_set_size(&tnlhdr, pktlen + INT_H_SIZE); tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE, GFP_ATOMIC); if (!tnlskb) { @@ -1728,6 +1919,12 @@ tnl: goto tnl; } + if (pktcnt_need_update) + skb_queue_walk(&tnlq, skb) { + hdr = buf_msg(skb); + msg_set_msgcnt(hdr, pktcnt); + } + tipc_link_xmit(tnl, &tnlq, xmitq); if (mtyp == FAILOVER_MSG) { @@ -1762,7 +1959,7 @@ void tipc_link_failover_prepare(struct tipc_link *l, struct tipc_link *tnl, tipc_link_create_dummy_tnl_msg(tnl, xmitq); - /* This failover link enpoint was never established before, + /* This failover link endpoint was never established before, * so it has not received anything from peer. * Otherwise, it must be a normal failover situation or the * node has entered SELF_DOWN_PEER_LEAVING and both peer nodes @@ -1952,19 +2149,18 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, &l->mon_state, l->bearer_id); /* Send NACK if peer has sent pkts we haven't received yet */ - if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l)) + if ((reply || msg_is_keepalive(hdr)) && + more(peers_snd_nxt, rcv_nxt) && + !tipc_link_is_synching(l) && + skb_queue_empty(&l->deferdq)) rcvgap = peers_snd_nxt - l->rcv_nxt; if (rcvgap || reply) tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, rcvgap, 0, 0, xmitq); rc |= tipc_link_advance_transmq(l, ack, gap, ga, xmitq); - - /* If NACK, retransmit will now start at right position */ if (gap) l->stats.recv_nacks++; - - tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) link_prepare_wakeup(l); } @@ -2183,15 +2379,18 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, return 0; } -void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) +void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win) { int max_bulk = TIPC_MAX_PUBL / (l->mtu / ITEM_SIZE); - l->window = win; - l->backlog[TIPC_LOW_IMPORTANCE].limit = max_t(u16, 50, win); - l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = max_t(u16, 100, win * 2); - l->backlog[TIPC_HIGH_IMPORTANCE].limit = max_t(u16, 150, win * 3); - l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = max_t(u16, 200, win * 4); + l->min_win = min_win; + l->ssthresh = max_win; + l->max_win = max_win; + l->window = min_win; + l->backlog[TIPC_LOW_IMPORTANCE].limit = min_win * 2; + l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = min_win * 4; + l->backlog[TIPC_HIGH_IMPORTANCE].limit = min_win * 6; + l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = min_win * 8; l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk; } @@ -2244,10 +2443,10 @@ int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]) } if (props[TIPC_NLA_PROP_WIN]) { - u32 win; + u32 max_win; - win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); - if ((win < TIPC_MIN_LINK_WIN) || (win > TIPC_MAX_LINK_WIN)) + max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + if (max_win < TIPC_DEF_LINK_WIN || max_win > TIPC_MAX_LINK_WIN) return -EINVAL; } @@ -2483,7 +2682,7 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK_PROP); if (!prop) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->max_win)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_BROADCAST, bc_mode)) goto prop_msg_full; diff --git a/net/tipc/link.h b/net/tipc/link.h index adcad65e761c..d3c1c3fc1659 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -73,7 +73,7 @@ enum { bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, - int window, u32 session, u32 ownnode, + u32 min_win, u32 max_win, u32 session, u32 ownnode, u32 peer, u8 *peer_id, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, @@ -81,7 +81,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, struct sk_buff_head *namedq, struct tipc_link **link); bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, - int mtu, int window, u16 peer_caps, + int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link *bc_sndlink, @@ -115,7 +115,8 @@ char *tipc_link_name_ext(struct tipc_link *l, char *buf); u32 tipc_link_state(struct tipc_link *l); char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); -int tipc_link_window(struct tipc_link *l); +int tipc_link_min_win(struct tipc_link *l); +int tipc_link_max_win(struct tipc_link *l); void tipc_link_update_caps(struct tipc_link *l, u16 capabilities); bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr); unsigned long tipc_link_tolerance(struct tipc_link *l); @@ -124,7 +125,7 @@ void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, void tipc_link_set_prio(struct tipc_link *l, u32 prio, struct sk_buff_head *xmitq); void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit); -void tipc_link_set_queue_limits(struct tipc_link *l, u32 window); +void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win); int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, struct tipc_link *link, int nlflags); int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); @@ -141,6 +142,7 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l, int tipc_link_bc_peers(struct tipc_link *l); void tipc_link_set_mtu(struct tipc_link *l, int mtu); int tipc_link_mtu(struct tipc_link *l); +int tipc_link_mss(struct tipc_link *l); void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, struct sk_buff_head *xmitq); void tipc_link_build_bc_sync_msg(struct tipc_link *l, diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 6a6eae88442f..58708b4c7719 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -665,6 +665,21 @@ void tipc_mon_delete(struct net *net, int bearer_id) kfree(mon); } +void tipc_mon_reinit_self(struct net *net) +{ + struct tipc_monitor *mon; + int bearer_id; + + for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { + mon = tipc_monitor(net, bearer_id); + if (!mon) + continue; + write_lock_bh(&mon->lock); + mon->self->addr = tipc_own_addr(net); + write_unlock_bh(&mon->lock); + } +} + int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size) { struct tipc_net *tn = tipc_net(net); diff --git a/net/tipc/monitor.h b/net/tipc/monitor.h index 2a21b93e0d04..ed63d2e650b0 100644 --- a/net/tipc/monitor.h +++ b/net/tipc/monitor.h @@ -77,6 +77,7 @@ int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg, u32 bearer_id); int tipc_nl_add_monitor_peer(struct net *net, struct tipc_nl_msg *msg, u32 bearer_id, u32 *prev_node); +void tipc_mon_reinit_self(struct net *net); extern const int tipc_max_domain_size; #endif diff --git a/net/tipc/msg.c b/net/tipc/msg.c index f48e5857210f..0d515d20b056 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -39,10 +39,16 @@ #include "msg.h" #include "addr.h" #include "name_table.h" +#include "crypto.h" #define MAX_FORWARD_SIZE 1024 +#ifdef CONFIG_TIPC_CRYPTO +#define BUF_HEADROOM ALIGN(((LL_MAX_HEADER + 48) + EHDR_MAX_SIZE), 16) +#define BUF_TAILROOM (TIPC_AES_GCM_TAG_SIZE) +#else #define BUF_HEADROOM (LL_MAX_HEADER + 48) #define BUF_TAILROOM 16 +#endif static unsigned int align(unsigned int i) { @@ -61,7 +67,11 @@ static unsigned int align(unsigned int i) struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp) { struct sk_buff *skb; +#ifdef CONFIG_TIPC_CRYPTO + unsigned int buf_size = (BUF_HEADROOM + size + BUF_TAILROOM + 3) & ~3u; +#else unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u; +#endif skb = alloc_skb_fclone(buf_size, gfp); if (skb) { @@ -173,7 +183,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) } if (fragid == LAST_FRAGMENT) { - TIPC_SKB_CB(head)->validated = false; + TIPC_SKB_CB(head)->validated = 0; if (unlikely(!tipc_msg_validate(&head))) goto err; *buf = head; @@ -190,6 +200,59 @@ err: return 0; } +/** + * tipc_msg_append(): Append data to tail of an existing buffer queue + * @hdr: header to be used + * @m: the data to be appended + * @mss: max allowable size of buffer + * @dlen: size of data to be appended + * @txq: queue to appand to + * Returns the number og 1k blocks appended or errno value + */ +int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, + int mss, struct sk_buff_head *txq) +{ + struct sk_buff *skb, *prev; + int accounted, total, curr; + int mlen, cpy, rem = dlen; + struct tipc_msg *hdr; + + skb = skb_peek_tail(txq); + accounted = skb ? msg_blocks(buf_msg(skb)) : 0; + total = accounted; + + while (rem) { + if (!skb || skb->len >= mss) { + prev = skb; + skb = tipc_buf_acquire(mss, GFP_KERNEL); + if (unlikely(!skb)) + return -ENOMEM; + skb_orphan(skb); + skb_trim(skb, MIN_H_SIZE); + hdr = buf_msg(skb); + skb_copy_to_linear_data(skb, _hdr, MIN_H_SIZE); + msg_set_hdr_sz(hdr, MIN_H_SIZE); + msg_set_size(hdr, MIN_H_SIZE); + __skb_queue_tail(txq, skb); + total += 1; + if (prev) + msg_set_ack_required(buf_msg(prev), 0); + msg_set_ack_required(hdr, 1); + } + hdr = buf_msg(skb); + curr = msg_blocks(hdr); + mlen = msg_size(hdr); + cpy = min_t(int, rem, mss - mlen); + if (cpy != copy_from_iter(skb->data + mlen, cpy, &m->msg_iter)) + return -EFAULT; + msg_set_size(hdr, mlen + cpy); + skb_put(skb, cpy); + rem -= cpy; + total += msg_blocks(hdr) - curr; + } + return total - accounted; +} + /* tipc_msg_validate - validate basic format of received message * * This routine ensures a TIPC message has an acceptable header, and at least @@ -218,6 +281,7 @@ bool tipc_msg_validate(struct sk_buff **_skb) if (unlikely(TIPC_SKB_CB(skb)->validated)) return true; + if (unlikely(!pskb_may_pull(skb, MIN_H_SIZE))) return false; @@ -239,11 +303,70 @@ bool tipc_msg_validate(struct sk_buff **_skb) if (unlikely(skb->len < msz)) return false; - TIPC_SKB_CB(skb)->validated = true; + TIPC_SKB_CB(skb)->validated = 1; return true; } /** + * tipc_msg_fragment - build a fragment skb list for TIPC message + * + * @skb: TIPC message skb + * @hdr: internal msg header to be put on the top of the fragments + * @pktmax: max size of a fragment incl. the header + * @frags: returned fragment skb list + * + * Returns 0 if the fragmentation is successful, otherwise: -EINVAL + * or -ENOMEM + */ +int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, + int pktmax, struct sk_buff_head *frags) +{ + int pktno, nof_fragms, dsz, dmax, eat; + struct tipc_msg *_hdr; + struct sk_buff *_skb; + u8 *data; + + /* Non-linear buffer? */ + if (skb_linearize(skb)) + return -ENOMEM; + + data = (u8 *)skb->data; + dsz = msg_size(buf_msg(skb)); + dmax = pktmax - INT_H_SIZE; + if (dsz <= dmax || !dmax) + return -EINVAL; + + nof_fragms = dsz / dmax + 1; + for (pktno = 1; pktno <= nof_fragms; pktno++) { + if (pktno < nof_fragms) + eat = dmax; + else + eat = dsz % dmax; + /* Allocate a new fragment */ + _skb = tipc_buf_acquire(INT_H_SIZE + eat, GFP_ATOMIC); + if (!_skb) + goto error; + skb_orphan(_skb); + __skb_queue_tail(frags, _skb); + /* Copy header & data to the fragment */ + skb_copy_to_linear_data(_skb, hdr, INT_H_SIZE); + skb_copy_to_linear_data_offset(_skb, INT_H_SIZE, data, eat); + data += eat; + /* Update the fragment's header */ + _hdr = buf_msg(_skb); + msg_set_fragm_no(_hdr, pktno); + msg_set_nof_fragms(_hdr, nof_fragms); + msg_set_size(_hdr, INT_H_SIZE + eat); + } + return 0; + +error: + __skb_queue_purge(frags); + __skb_queue_head_init(frags); + return -ENOMEM; +} + +/** * tipc_msg_build - create buffer chain containing specified header and data * @mhdr: Message header, to be prepended to data * @m: User message @@ -360,48 +483,98 @@ error: } /** - * tipc_msg_bundle(): Append contents of a buffer to tail of an existing one - * @skb: the buffer to append to ("bundle") - * @msg: message to be appended - * @mtu: max allowable size for the bundle buffer - * Consumes buffer if successful - * Returns true if bundling could be performed, otherwise false + * tipc_msg_bundle - Append contents of a buffer to tail of an existing one + * @bskb: the bundle buffer to append to + * @msg: message to be appended + * @max: max allowable size for the bundle buffer + * + * Returns "true" if bundling has been performed, otherwise "false" */ -bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu) +static bool tipc_msg_bundle(struct sk_buff *bskb, struct tipc_msg *msg, + u32 max) { - struct tipc_msg *bmsg; - unsigned int bsz; - unsigned int msz = msg_size(msg); - u32 start, pad; - u32 max = mtu - INT_H_SIZE; + struct tipc_msg *bmsg = buf_msg(bskb); + u32 msz, bsz, offset, pad; - if (likely(msg_user(msg) == MSG_FRAGMENTER)) - return false; - if (!skb) - return false; - bmsg = buf_msg(skb); + msz = msg_size(msg); bsz = msg_size(bmsg); - start = align(bsz); - pad = start - bsz; + offset = align(bsz); + pad = offset - bsz; - if (unlikely(msg_user(msg) == TUNNEL_PROTOCOL)) + if (unlikely(skb_tailroom(bskb) < (pad + msz))) return false; - if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) + if (unlikely(max < (offset + msz))) return false; - if (unlikely(msg_user(bmsg) != MSG_BUNDLER)) + + skb_put(bskb, pad + msz); + skb_copy_to_linear_data_offset(bskb, offset, msg, msz); + msg_set_size(bmsg, offset + msz); + msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1); + return true; +} + +/** + * tipc_msg_try_bundle - Try to bundle a new message to the last one + * @tskb: the last/target message to which the new one will be appended + * @skb: the new message skb pointer + * @mss: max message size (header inclusive) + * @dnode: destination node for the message + * @new_bundle: if this call made a new bundle or not + * + * Return: "true" if the new message skb is potential for bundling this time or + * later, in the case a bundling has been done this time, the skb is consumed + * (the skb pointer = NULL). + * Otherwise, "false" if the skb cannot be bundled at all. + */ +bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, + u32 dnode, bool *new_bundle) +{ + struct tipc_msg *msg, *inner, *outer; + u32 tsz; + + /* First, check if the new buffer is suitable for bundling */ + msg = buf_msg(*skb); + if (msg_user(msg) == MSG_FRAGMENTER) return false; - if (unlikely(skb_tailroom(skb) < (pad + msz))) + if (msg_user(msg) == TUNNEL_PROTOCOL) return false; - if (unlikely(max < (start + msz))) + if (msg_user(msg) == BCAST_PROTOCOL) return false; - if ((msg_importance(msg) < TIPC_SYSTEM_IMPORTANCE) && - (msg_importance(bmsg) == TIPC_SYSTEM_IMPORTANCE)) + if (mss <= INT_H_SIZE + msg_size(msg)) return false; - skb_put(skb, pad + msz); - skb_copy_to_linear_data_offset(skb, start, msg, msz); - msg_set_size(bmsg, start + msz); - msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1); + /* Ok, but the last/target buffer can be empty? */ + if (unlikely(!tskb)) + return true; + + /* Is it a bundle already? Try to bundle the new message to it */ + if (msg_user(buf_msg(tskb)) == MSG_BUNDLER) { + *new_bundle = false; + goto bundle; + } + + /* Make a new bundle of the two messages if possible */ + tsz = msg_size(buf_msg(tskb)); + if (unlikely(mss < align(INT_H_SIZE + tsz) + msg_size(msg))) + return true; + if (unlikely(pskb_expand_head(tskb, INT_H_SIZE, mss - tsz - INT_H_SIZE, + GFP_ATOMIC))) + return true; + inner = buf_msg(tskb); + skb_push(tskb, INT_H_SIZE); + outer = buf_msg(tskb); + tipc_msg_init(msg_prevnode(inner), outer, MSG_BUNDLER, 0, INT_H_SIZE, + dnode); + msg_set_importance(outer, msg_importance(inner)); + msg_set_size(outer, INT_H_SIZE + tsz); + msg_set_msgcnt(outer, 1); + *new_bundle = true; + +bundle: + if (likely(tipc_msg_bundle(tskb, msg, mss))) { + consume_skb(*skb); + *skb = NULL; + } return true; } @@ -451,52 +624,6 @@ none: } /** - * tipc_msg_make_bundle(): Create bundle buf and append message to its tail - * @list: the buffer chain, where head is the buffer to replace/append - * @skb: buffer to be created, appended to and returned in case of success - * @msg: message to be appended - * @mtu: max allowable size for the bundle buffer, inclusive header - * @dnode: destination node for message. (Not always present in header) - * Returns true if success, otherwise false - */ -bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, - u32 mtu, u32 dnode) -{ - struct sk_buff *_skb; - struct tipc_msg *bmsg; - u32 msz = msg_size(msg); - u32 max = mtu - INT_H_SIZE; - - if (msg_user(msg) == MSG_FRAGMENTER) - return false; - if (msg_user(msg) == TUNNEL_PROTOCOL) - return false; - if (msg_user(msg) == BCAST_PROTOCOL) - return false; - if (msz > (max / 2)) - return false; - - _skb = tipc_buf_acquire(max, GFP_ATOMIC); - if (!_skb) - return false; - - skb_trim(_skb, INT_H_SIZE); - bmsg = buf_msg(_skb); - tipc_msg_init(msg_prevnode(msg), bmsg, MSG_BUNDLER, 0, - INT_H_SIZE, dnode); - if (msg_isdata(msg)) - msg_set_importance(bmsg, TIPC_CRITICAL_IMPORTANCE); - else - msg_set_importance(bmsg, TIPC_SYSTEM_IMPORTANCE); - msg_set_seqno(bmsg, msg_seqno(msg)); - msg_set_ack(bmsg, msg_ack(msg)); - msg_set_bcast_ack(bmsg, msg_bcast_ack(msg)); - tipc_msg_bundle(_skb, msg, mtu); - *skb = _skb; - return true; -} - -/** * tipc_msg_reverse(): swap source and destination addresses and add error code * @own_node: originating node id for reversed message * @skb: buffer containing message to be reversed; will be consumed diff --git a/net/tipc/msg.h b/net/tipc/msg.h index d7ebc9e955f6..6d466ebdb64f 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -102,16 +102,42 @@ struct plist; #define TIPC_MEDIA_INFO_OFFSET 5 struct tipc_skb_cb { - struct sk_buff *tail; - unsigned long nxt_retr; - unsigned long retr_stamp; - u32 bytes_read; - u32 orig_member; - u16 chain_imp; - u16 ackers; - u16 retr_cnt; - bool validated; -}; + union { + struct { + struct sk_buff *tail; + unsigned long nxt_retr; + unsigned long retr_stamp; + u32 bytes_read; + u32 orig_member; + u16 chain_imp; + u16 ackers; + u16 retr_cnt; + } __packed; +#ifdef CONFIG_TIPC_CRYPTO + struct { + struct tipc_crypto *rx; + struct tipc_aead *last; + u8 recurs; + } tx_clone_ctx __packed; +#endif + } __packed; + union { + struct { + u8 validated:1; +#ifdef CONFIG_TIPC_CRYPTO + u8 encrypted:1; + u8 decrypted:1; + u8 probe:1; + u8 tx_clone_deferred:1; +#endif + }; + u8 flags; + }; + u8 reserved; +#ifdef CONFIG_TIPC_CRYPTO + void *crypto_ctx; +#endif +} __packed; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) @@ -290,6 +316,16 @@ static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d) msg_set_bits(m, 0, 18, 1, d); } +static inline int msg_ack_required(struct tipc_msg *m) +{ + return msg_bits(m, 0, 18, 1); +} + +static inline void msg_set_ack_required(struct tipc_msg *m, u32 d) +{ + msg_set_bits(m, 0, 18, 1, d); +} + static inline bool msg_is_rcast(struct tipc_msg *m) { return msg_bits(m, 0, 18, 0x1); @@ -723,12 +759,26 @@ static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n) msg_set_bits(m, 4, 16, 0xffff, n); } +static inline u32 msg_nof_fragms(struct tipc_msg *m) +{ + return msg_bits(m, 4, 0, 0xffff); +} + +static inline void msg_set_nof_fragms(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 4, 0, 0xffff, n); +} + +static inline u32 msg_fragm_no(struct tipc_msg *m) +{ + return msg_bits(m, 4, 16, 0xffff); +} + static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 16, 0xffff, n); } - static inline u16 msg_next_sent(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); @@ -879,6 +929,16 @@ static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n) msg_set_bits(m, 9, 16, 0xffff, n); } +static inline u16 msg_syncpt(struct tipc_msg *m) +{ + return msg_bits(m, 9, 16, 0xffff); +} + +static inline void msg_set_syncpt(struct tipc_msg *m, u16 n) +{ + msg_set_bits(m, 9, 16, 0xffff, n); +} + static inline u32 msg_conn_ack(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); @@ -1002,6 +1062,20 @@ static inline bool msg_is_reset(struct tipc_msg *hdr) return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); } +/* Word 13 + */ +static inline void msg_set_peer_net_hash(struct tipc_msg *m, u32 n) +{ + msg_set_word(m, 13, n); +} + +static inline u32 msg_peer_net_hash(struct tipc_msg *m) +{ + return msg_word(m, 13); +} + +/* Word 14 + */ static inline u32 msg_sugg_node_addr(struct tipc_msg *m) { return msg_word(m, 14); @@ -1033,12 +1107,15 @@ struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode); int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); -bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu); -bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, - u32 mtu, u32 dnode); +bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, + u32 dnode, bool *new_bundle); bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); +int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, + int pktmax, struct sk_buff_head *frags); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); +int tipc_msg_append(struct tipc_msg *hdr, struct msghdr *m, int dlen, + int mss, struct sk_buff_head *txq); bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); bool tipc_msg_assemble(struct sk_buff_head *list); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 44abc8e9c990..5feaf3b67380 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -146,7 +146,7 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, struct publication *publ; struct sk_buff *skb = NULL; struct distr_item *item = NULL; - u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0) - INT_H_SIZE) / + u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0, false) - INT_H_SIZE) / ITEM_SIZE) * ITEM_SIZE; u32 msg_rem = msg_dsz; @@ -190,7 +190,7 @@ void tipc_named_node_up(struct net *net, u32 dnode) struct name_table *nt = tipc_name_table(net); struct sk_buff_head head; - skb_queue_head_init(&head); + __skb_queue_head_init(&head); read_lock_bh(&nt->cluster_scope_lock); named_distribute(net, &head, dnode, &nt->cluster_scope); @@ -223,7 +223,8 @@ static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr) publ->key); } - kfree_rcu(p, rcu); + if (p) + kfree_rcu(p, rcu); } /** diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 66a65c2cdb23..359b2bc888cf 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -35,6 +35,8 @@ */ #include <net/sock.h> +#include <linux/list_sort.h> +#include <linux/rbtree_augmented.h> #include "core.h" #include "netlink.h" #include "name_table.h" @@ -50,6 +52,7 @@ * @lower: service range lower bound * @upper: service range upper bound * @tree_node: member of service range RB tree + * @max: largest 'upper' in this node subtree * @local_publ: list of identical publications made from this node * Used by closest_first lookup and multicast lookup algorithm * @all_publ: all publications identical to this one, whatever node and scope @@ -59,6 +62,7 @@ struct service_range { u32 lower; u32 upper; struct rb_node tree_node; + u32 max; struct list_head local_publ; struct list_head all_publ; }; @@ -66,6 +70,7 @@ struct service_range { /** * struct tipc_service - container for all published instances of a service type * @type: 32 bit 'type' value for service + * @publ_cnt: increasing counter for publications in this service * @ranges: rb tree containing all service ranges for this service * @service_list: links to adjacent name ranges in hash chain * @subscriptions: list of subscriptions for this service type @@ -74,6 +79,7 @@ struct service_range { */ struct tipc_service { u32 type; + u32 publ_cnt; struct rb_root ranges; struct hlist_node service_list; struct list_head subscriptions; @@ -81,6 +87,130 @@ struct tipc_service { struct rcu_head rcu; }; +#define service_range_upper(sr) ((sr)->upper) +RB_DECLARE_CALLBACKS_MAX(static, sr_callbacks, + struct service_range, tree_node, u32, max, + service_range_upper) + +#define service_range_entry(rbtree_node) \ + (container_of(rbtree_node, struct service_range, tree_node)) + +#define service_range_overlap(sr, start, end) \ + ((sr)->lower <= (end) && (sr)->upper >= (start)) + +/** + * service_range_foreach_match - iterate over tipc service rbtree for each + * range match + * @sr: the service range pointer as a loop cursor + * @sc: the pointer to tipc service which holds the service range rbtree + * @start, end: the range (end >= start) for matching + */ +#define service_range_foreach_match(sr, sc, start, end) \ + for (sr = service_range_match_first((sc)->ranges.rb_node, \ + start, \ + end); \ + sr; \ + sr = service_range_match_next(&(sr)->tree_node, \ + start, \ + end)) + +/** + * service_range_match_first - find first service range matching a range + * @n: the root node of service range rbtree for searching + * @start, end: the range (end >= start) for matching + * + * Return: the leftmost service range node in the rbtree that overlaps the + * specific range if any. Otherwise, returns NULL. + */ +static struct service_range *service_range_match_first(struct rb_node *n, + u32 start, u32 end) +{ + struct service_range *sr; + struct rb_node *l, *r; + + /* Non overlaps in tree at all? */ + if (!n || service_range_entry(n)->max < start) + return NULL; + + while (n) { + l = n->rb_left; + if (l && service_range_entry(l)->max >= start) { + /* A leftmost overlap range node must be one in the left + * subtree. If not, it has lower > end, then nodes on + * the right side cannot satisfy the condition either. + */ + n = l; + continue; + } + + /* No one in the left subtree can match, return if this node is + * an overlap i.e. leftmost. + */ + sr = service_range_entry(n); + if (service_range_overlap(sr, start, end)) + return sr; + + /* Ok, try to lookup on the right side */ + r = n->rb_right; + if (sr->lower <= end && + r && service_range_entry(r)->max >= start) { + n = r; + continue; + } + break; + } + + return NULL; +} + +/** + * service_range_match_next - find next service range matching a range + * @n: a node in service range rbtree from which the searching starts + * @start, end: the range (end >= start) for matching + * + * Return: the next service range node to the given node in the rbtree that + * overlaps the specific range if any. Otherwise, returns NULL. + */ +static struct service_range *service_range_match_next(struct rb_node *n, + u32 start, u32 end) +{ + struct service_range *sr; + struct rb_node *p, *r; + + while (n) { + r = n->rb_right; + if (r && service_range_entry(r)->max >= start) + /* A next overlap range node must be one in the right + * subtree. If not, it has lower > end, then any next + * successor (- an ancestor) of this node cannot + * satisfy the condition either. + */ + return service_range_match_first(r, start, end); + + /* No one in the right subtree can match, go up to find an + * ancestor of this node which is parent of a left-hand child. + */ + while ((p = rb_parent(n)) && n == p->rb_right) + n = p; + if (!p) + break; + + /* Return if this ancestor is an overlap */ + sr = service_range_entry(p); + if (service_range_overlap(sr, start, end)) + return sr; + + /* Ok, try to lookup more from this ancestor */ + if (sr->lower <= end) { + n = p; + continue; + } + break; + } + + return NULL; +} + static int hash(int x) { return x & (TIPC_NAMETBL_SIZE - 1); @@ -109,6 +239,7 @@ static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper, INIT_LIST_HEAD(&publ->binding_node); INIT_LIST_HEAD(&publ->local_publ); INIT_LIST_HEAD(&publ->all_publ); + INIT_LIST_HEAD(&publ->list); return publ; } @@ -135,84 +266,51 @@ static struct tipc_service *tipc_service_create(u32 type, struct hlist_head *hd) return service; } -/** - * tipc_service_first_range - find first service range in tree matching instance - * - * Very time-critical, so binary search through range rb tree - */ -static struct service_range *tipc_service_first_range(struct tipc_service *sc, - u32 instance) -{ - struct rb_node *n = sc->ranges.rb_node; - struct service_range *sr; - - while (n) { - sr = container_of(n, struct service_range, tree_node); - if (sr->lower > instance) - n = n->rb_left; - else if (sr->upper < instance) - n = n->rb_right; - else - return sr; - } - return NULL; -} - /* tipc_service_find_range - find service range matching publication parameters */ static struct service_range *tipc_service_find_range(struct tipc_service *sc, u32 lower, u32 upper) { - struct rb_node *n = sc->ranges.rb_node; struct service_range *sr; - sr = tipc_service_first_range(sc, lower); - if (!sr) - return NULL; - - /* Look for exact match */ - for (n = &sr->tree_node; n; n = rb_next(n)) { - sr = container_of(n, struct service_range, tree_node); - if (sr->upper == upper) - break; + service_range_foreach_match(sr, sc, lower, upper) { + /* Look for exact match */ + if (sr->lower == lower && sr->upper == upper) + return sr; } - if (!n || sr->lower != lower || sr->upper != upper) - return NULL; - return sr; + return NULL; } static struct service_range *tipc_service_create_range(struct tipc_service *sc, u32 lower, u32 upper) { struct rb_node **n, *parent = NULL; - struct service_range *sr, *tmp; + struct service_range *sr; n = &sc->ranges.rb_node; while (*n) { - tmp = container_of(*n, struct service_range, tree_node); parent = *n; - tmp = container_of(parent, struct service_range, tree_node); - if (lower < tmp->lower) - n = &(*n)->rb_left; - else if (lower > tmp->lower) - n = &(*n)->rb_right; - else if (upper < tmp->upper) - n = &(*n)->rb_left; - else if (upper > tmp->upper) - n = &(*n)->rb_right; + sr = service_range_entry(parent); + if (lower == sr->lower && upper == sr->upper) + return sr; + if (sr->max < upper) + sr->max = upper; + if (lower <= sr->lower) + n = &parent->rb_left; else - return tmp; + n = &parent->rb_right; } sr = kzalloc(sizeof(*sr), GFP_ATOMIC); if (!sr) return NULL; sr->lower = lower; sr->upper = upper; + sr->max = upper; INIT_LIST_HEAD(&sr->local_publ); INIT_LIST_HEAD(&sr->all_publ); rb_link_node(&sr->tree_node, parent, n); - rb_insert_color(&sr->tree_node, &sc->ranges); + rb_insert_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); return sr; } @@ -244,6 +342,8 @@ static struct publication *tipc_service_insert_publ(struct net *net, p = tipc_publ_create(type, lower, upper, scope, node, port, key); if (!p) goto err; + /* Suppose there shouldn't be a huge gap btw publs i.e. >INT_MAX */ + p->id = sc->publ_cnt++; if (in_own_node(net, node)) list_add(&p->local_publ, &sr->local_publ); list_add(&p->all_publ, &sr->all_publ); @@ -278,6 +378,20 @@ static struct publication *tipc_service_remove_publ(struct service_range *sr, } /** + * Code reused: time_after32() for the same purpose + */ +#define publication_after(pa, pb) time_after32((pa)->id, (pb)->id) +static int tipc_publ_sort(void *priv, struct list_head *a, + struct list_head *b) +{ + struct publication *pa, *pb; + + pa = container_of(a, struct publication, list); + pb = container_of(b, struct publication, list); + return publication_after(pa, pb); +} + +/** * tipc_service_subscribe - attach a subscription, and optionally * issue the prescribed number of events if there is any service * range overlapping with the requested range @@ -286,36 +400,44 @@ static void tipc_service_subscribe(struct tipc_service *service, struct tipc_subscription *sub) { struct tipc_subscr *sb = &sub->evt.s; + struct publication *p, *first, *tmp; + struct list_head publ_list; struct service_range *sr; struct tipc_name_seq ns; - struct publication *p; - struct rb_node *n; - bool first; + u32 filter; ns.type = tipc_sub_read(sb, seq.type); ns.lower = tipc_sub_read(sb, seq.lower); ns.upper = tipc_sub_read(sb, seq.upper); + filter = tipc_sub_read(sb, filter); tipc_sub_get(sub); list_add(&sub->service_list, &service->subscriptions); - if (tipc_sub_read(sb, filter) & TIPC_SUB_NO_STATUS) + if (filter & TIPC_SUB_NO_STATUS) return; - for (n = rb_first(&service->ranges); n; n = rb_next(n)) { - sr = container_of(n, struct service_range, tree_node); - if (sr->lower > ns.upper) - break; - if (!tipc_sub_check_overlap(&ns, sr->lower, sr->upper)) - continue; - first = true; - + INIT_LIST_HEAD(&publ_list); + service_range_foreach_match(sr, service, ns.lower, ns.upper) { + first = NULL; list_for_each_entry(p, &sr->all_publ, all_publ) { - tipc_sub_report_overlap(sub, sr->lower, sr->upper, - TIPC_PUBLISHED, p->port, - p->node, p->scope, first); - first = false; + if (filter & TIPC_SUB_PORTS) + list_add_tail(&p->list, &publ_list); + else if (!first || publication_after(first, p)) + /* Pick this range's *first* publication */ + first = p; } + if (first) + list_add_tail(&first->list, &publ_list); + } + + /* Sort the publications before reporting */ + list_sort(NULL, &publ_list, tipc_publ_sort); + list_for_each_entry_safe(p, tmp, &publ_list, list) { + tipc_sub_report_overlap(sub, p->lower, p->upper, + TIPC_PUBLISHED, p->port, p->node, + p->scope, true); + list_del_init(&p->list); } } @@ -390,7 +512,7 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, /* Remove service range item if this was its last publication */ if (list_empty(&sr->all_publ)) { - rb_erase(&sr->tree_node, &sc->ranges); + rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } @@ -438,34 +560,39 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *dnode) rcu_read_lock(); sc = tipc_service_find(net, type); if (unlikely(!sc)) - goto not_found; + goto exit; spin_lock_bh(&sc->lock); - sr = tipc_service_first_range(sc, instance); - if (unlikely(!sr)) - goto no_match; - - /* Select lookup algorithm: local, closest-first or round-robin */ - if (*dnode == self) { - list = &sr->local_publ; - if (list_empty(list)) - goto no_match; - p = list_first_entry(list, struct publication, local_publ); - list_move_tail(&p->local_publ, &sr->local_publ); - } else if (legacy && !*dnode && !list_empty(&sr->local_publ)) { - list = &sr->local_publ; - p = list_first_entry(list, struct publication, local_publ); - list_move_tail(&p->local_publ, &sr->local_publ); - } else { - list = &sr->all_publ; - p = list_first_entry(list, struct publication, all_publ); - list_move_tail(&p->all_publ, &sr->all_publ); + service_range_foreach_match(sr, sc, instance, instance) { + /* Select lookup algo: local, closest-first or round-robin */ + if (*dnode == self) { + list = &sr->local_publ; + if (list_empty(list)) + continue; + p = list_first_entry(list, struct publication, + local_publ); + list_move_tail(&p->local_publ, &sr->local_publ); + } else if (legacy && !*dnode && !list_empty(&sr->local_publ)) { + list = &sr->local_publ; + p = list_first_entry(list, struct publication, + local_publ); + list_move_tail(&p->local_publ, &sr->local_publ); + } else { + list = &sr->all_publ; + p = list_first_entry(list, struct publication, + all_publ); + list_move_tail(&p->all_publ, &sr->all_publ); + } + port = p->port; + node = p->node; + /* Todo: as for legacy, pick the first matching range only, a + * "true" round-robin will be performed as needed. + */ + break; } - port = p->port; - node = p->node; -no_match: spin_unlock_bh(&sc->lock); -not_found: + +exit: rcu_read_unlock(); *dnode = node; return port; @@ -488,7 +615,8 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope, spin_lock_bh(&sc->lock); - sr = tipc_service_first_range(sc, instance); + /* Todo: a full search i.e. service_range_foreach_match() instead? */ + sr = service_range_match_first(sc->ranges.rb_node, instance, instance); if (!sr) goto no_match; @@ -517,7 +645,6 @@ void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, struct service_range *sr; struct tipc_service *sc; struct publication *p; - struct rb_node *n; rcu_read_lock(); sc = tipc_service_find(net, type); @@ -525,13 +652,7 @@ void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, goto exit; spin_lock_bh(&sc->lock); - - for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { - sr = container_of(n, struct service_range, tree_node); - if (sr->upper < lower) - continue; - if (sr->lower > upper) - break; + service_range_foreach_match(sr, sc, lower, upper) { list_for_each_entry(p, &sr->local_publ, local_publ) { if (p->scope == scope || (!exact && p->scope < scope)) tipc_dest_push(dports, 0, p->port); @@ -552,7 +673,6 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, struct service_range *sr; struct tipc_service *sc; struct publication *p; - struct rb_node *n; rcu_read_lock(); sc = tipc_service_find(net, type); @@ -560,13 +680,7 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, goto exit; spin_lock_bh(&sc->lock); - - for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { - sr = container_of(n, struct service_range, tree_node); - if (sr->upper < lower) - continue; - if (sr->lower > upper) - break; + service_range_foreach_match(sr, sc, lower, upper) { list_for_each_entry(p, &sr->all_publ, all_publ) { tipc_nlist_add(nodes, p->node); } @@ -764,7 +878,7 @@ static void tipc_service_delete(struct net *net, struct tipc_service *sc) tipc_service_remove_publ(sr, p->node, p->key); kfree_rcu(p, rcu); } - rb_erase(&sr->tree_node, &sc->ranges); + rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } hlist_del_init_rcu(&sc->service_list); diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index f79066334cc8..728bc7016c38 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -58,6 +58,7 @@ struct tipc_group; * @node: network address of publishing socket's node * @port: publishing port * @key: publication key, unique across the cluster + * @id: publication id * @binding_node: all publications from the same node which bound this one * - Remote publications: in node->publ_list * Used by node/name distr to withdraw publications when node is lost @@ -69,6 +70,7 @@ struct tipc_group; * Used by closest_first and multicast receive lookup algorithms * @all_publ: all publications identical to this one, whatever node and scope * Used by round-robin lookup algorithm + * @list: to form a list of publications in temporal order * @rcu: RCU callback head used for deferred freeing */ struct publication { @@ -79,10 +81,12 @@ struct publication { u32 node; u32 port; u32 key; + u32 id; struct list_head binding_node; struct list_head binding_sock; struct list_head local_publ; struct list_head all_publ; + struct list_head list; struct rcu_head rcu; }; diff --git a/net/tipc/net.c b/net/tipc/net.c index 85707c185360..85400e4242de 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -42,6 +42,7 @@ #include "node.h" #include "bcast.h" #include "netlink.h" +#include "monitor.h" /* * The TIPC locking policy is designed to ensure a very fine locking @@ -136,6 +137,7 @@ static void tipc_net_finalize(struct net *net, u32 addr) tipc_set_node_addr(net, addr); tipc_named_reinit(net); tipc_sk_reinit(net); + tipc_mon_reinit_self(net); tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, TIPC_CLUSTER_SCOPE, 0, addr); } @@ -300,3 +302,59 @@ int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) return err; } + +static int __tipc_nl_addr_legacy_get(struct net *net, struct tipc_nl_msg *msg) +{ + struct tipc_net *tn = tipc_net(net); + struct nlattr *attrs; + void *hdr; + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, + 0, TIPC_NL_ADDR_LEGACY_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_NET); + if (!attrs) + goto msg_full; + + if (tn->legacy_addr_format) + if (nla_put_flag(msg->skb, TIPC_NLA_NET_ADDR_LEGACY)) + goto attr_msg_full; + + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + + return 0; + +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + genlmsg_cancel(msg->skb, hdr); + + return -EMSGSIZE; +} + +int tipc_nl_net_addr_legacy_get(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct tipc_nl_msg msg; + struct sk_buff *rep; + int err; + + rep = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!rep) + return -ENOMEM; + + msg.skb = rep; + msg.portid = info->snd_portid; + msg.seq = info->snd_seq; + + err = __tipc_nl_addr_legacy_get(net, &msg); + if (err) { + nlmsg_free(msg.skb); + return err; + } + + return genlmsg_reply(msg.skb, info); +} diff --git a/net/tipc/net.h b/net/tipc/net.h index b7f2e364eb99..6740d97c706e 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -47,5 +47,6 @@ void tipc_net_stop(struct net *net); int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_net_addr_legacy_get(struct sk_buff *skb, struct genl_info *info); #endif diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index d6165ad384c0..7c35094c20b8 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -83,6 +83,7 @@ const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { [TIPC_NLA_NET_ADDR] = { .type = NLA_U32 }, [TIPC_NLA_NET_NODEID] = { .type = NLA_U64 }, [TIPC_NLA_NET_NODEID_W1] = { .type = NLA_U64 }, + [TIPC_NLA_NET_ADDR_LEGACY] = { .type = NLA_FLAG } }; const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { @@ -102,7 +103,11 @@ const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = { [TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC }, [TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 }, - [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG } + [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG }, + [TIPC_NLA_NODE_ID] = { .type = NLA_BINARY, + .len = TIPC_NODEID_LEN}, + [TIPC_NLA_NODE_KEY] = { .type = NLA_BINARY, + .len = TIPC_AEAD_KEY_SIZE_MAX}, }; /* Properties valid for media, bearer and link */ @@ -176,7 +181,8 @@ static const struct genl_ops tipc_genl_v2_ops[] = { }, { .cmd = TIPC_NL_PUBL_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = tipc_nl_publ_dump, }, { @@ -239,7 +245,8 @@ static const struct genl_ops tipc_genl_v2_ops[] = { }, { .cmd = TIPC_NL_MON_PEER_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = tipc_nl_node_dump_monitor_peer, }, { @@ -250,10 +257,28 @@ static const struct genl_ops tipc_genl_v2_ops[] = { #ifdef CONFIG_TIPC_MEDIA_UDP { .cmd = TIPC_NL_UDP_GET_REMOTEIP, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = tipc_udp_nl_dump_remoteip, }, #endif +#ifdef CONFIG_TIPC_CRYPTO + { + .cmd = TIPC_NL_KEY_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = tipc_nl_node_set_key, + }, + { + .cmd = TIPC_NL_KEY_FLUSH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = tipc_nl_node_flush_key, + }, +#endif + { + .cmd = TIPC_NL_ADDR_LEGACY_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = tipc_nl_net_addr_legacy_get, + }, }; struct genl_family tipc_genl_family __ro_after_init = { @@ -268,18 +293,6 @@ struct genl_family tipc_genl_family __ro_after_init = { .n_ops = ARRAY_SIZE(tipc_genl_v2_ops), }; -int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) -{ - u32 maxattr = tipc_genl_family.maxattr; - - *attr = genl_family_attrbuf(&tipc_genl_family); - if (!*attr) - return -EOPNOTSUPP; - - return nlmsg_parse_deprecated(nlh, GENL_HDRLEN, *attr, maxattr, - tipc_nl_policy, NULL); -} - int __init tipc_netlink_start(void) { int res; diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h index 4ba0ad422110..7cf777723e3e 100644 --- a/net/tipc/netlink.h +++ b/net/tipc/netlink.h @@ -38,7 +38,6 @@ #include <net/netlink.h> extern struct genl_family tipc_genl_family; -int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***buf); struct tipc_nl_msg { struct sk_buff *skb; diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index e135d4e11231..217516357ef2 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -181,15 +181,18 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, struct tipc_nl_compat_msg *msg, struct sk_buff *arg) { + struct genl_dumpit_info info; int len = 0; int err; struct sk_buff *buf; struct nlmsghdr *nlmsg; struct netlink_callback cb; + struct nlattr **attrbuf; memset(&cb, 0, sizeof(cb)); cb.nlh = (struct nlmsghdr *)arg->data; cb.skb = arg; + cb.data = &info; buf = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!buf) @@ -201,19 +204,35 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, return -ENOMEM; } + attrbuf = kcalloc(tipc_genl_family.maxattr + 1, + sizeof(struct nlattr *), GFP_KERNEL); + if (!attrbuf) { + err = -ENOMEM; + goto err_out; + } + + info.attrs = attrbuf; + err = nlmsg_parse_deprecated(cb.nlh, GENL_HDRLEN, attrbuf, + tipc_genl_family.maxattr, + tipc_genl_family.policy, NULL); + if (err) + goto err_out; + do { int rem; len = (*cmd->dumpit)(buf, &cb); nlmsg_for_each_msg(nlmsg, nlmsg_hdr(buf), len, rem) { - struct nlattr **attrs; - - err = tipc_nlmsg_parse(nlmsg, &attrs); + err = nlmsg_parse_deprecated(nlmsg, GENL_HDRLEN, + attrbuf, + tipc_genl_family.maxattr, + tipc_genl_family.policy, + NULL); if (err) goto err_out; - err = (*cmd->format)(msg, attrs); + err = (*cmd->format)(msg, attrbuf); if (err) goto err_out; @@ -231,6 +250,7 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, err = 0; err_out: + kfree(attrbuf); tipc_dump_done(&cb); kfree_skb(buf); @@ -550,7 +570,7 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg, if (len <= 0) return -EINVAL; - len = min_t(int, len, TIPC_MAX_BEARER_NAME); + len = min_t(int, len, TIPC_MAX_LINK_NAME); if (!string_is_valid(name, len)) return -EINVAL; @@ -822,7 +842,7 @@ static int tipc_nl_compat_link_reset_stats(struct tipc_nl_compat_cmd_doit *cmd, if (len <= 0) return -EINVAL; - len = min_t(int, len, TIPC_MAX_BEARER_NAME); + len = min_t(int, len, TIPC_MAX_LINK_NAME); if (!string_is_valid(name, len)) return -EINVAL; diff --git a/net/tipc/node.c b/net/tipc/node.c index 3a5be1d7e572..0c88778c88b5 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -44,6 +44,7 @@ #include "discover.h" #include "netlink.h" #include "trace.h" +#include "crypto.h" #define INVALID_NODE_SIG 0x10000 #define NODE_CLEANUP_AFTER 300000 @@ -89,6 +90,7 @@ struct tipc_bclink_entry { * @links: array containing references to all links to node * @action_flags: bit mask of different types of node actions * @state: connectivity state vs peer node + * @preliminary: a preliminary node or not * @sync_point: sequence number where synch/failover is finished * @list: links to adjacent nodes in sorted list of cluster's nodes * @working_links: number of working links to node (both active and standby) @@ -99,6 +101,7 @@ struct tipc_bclink_entry { * @publ_list: list of publications * @rcu: rcu struct for tipc_node * @delete_at: indicates the time for deleting a down node + * @crypto_rx: RX crypto handler */ struct tipc_node { u32 addr; @@ -112,6 +115,7 @@ struct tipc_node { int action_flags; struct list_head list; int state; + bool preliminary; bool failover_sent; u16 sync_point; int link_cnt; @@ -120,12 +124,18 @@ struct tipc_node { u32 signature; u32 link_id; u8 peer_id[16]; + char peer_id_string[NODE_ID_STR_LEN]; struct list_head publ_list; struct list_head conn_sks; unsigned long keepalive_intv; struct timer_list timer; struct rcu_head rcu; unsigned long delete_at; + struct net *peer_net; + u32 peer_hash_mix; +#ifdef CONFIG_TIPC_CRYPTO + struct tipc_crypto *crypto_rx; +#endif }; /* Node FSM states and events: @@ -163,7 +173,6 @@ static void tipc_node_timeout(struct timer_list *t); static void tipc_node_fsm_evt(struct tipc_node *n, int evt); static struct tipc_node *tipc_node_find(struct net *net, u32 addr); static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id); -static void tipc_node_put(struct tipc_node *node); static bool node_is_up(struct tipc_node *n); static void tipc_node_delete_from_list(struct tipc_node *node); @@ -184,7 +193,7 @@ static struct tipc_link *node_active_link(struct tipc_node *n, int sel) return n->links[bearer_id].link; } -int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) +int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected) { struct tipc_node *n; int bearer_id; @@ -194,6 +203,14 @@ int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) if (unlikely(!n)) return mtu; + /* Allow MAX_MSG_SIZE when building connection oriented message + * if they are in the same core network + */ + if (n->peer_net && connected) { + tipc_node_put(n); + return mtu; + } + bearer_id = n->active_links[sel & 1]; if (likely(bearer_id != INVALID_BEARER_ID)) mtu = n->links[bearer_id].mtu; @@ -235,15 +252,51 @@ u16 tipc_node_get_capabilities(struct net *net, u32 addr) return caps; } +u32 tipc_node_get_addr(struct tipc_node *node) +{ + return (node) ? node->addr : 0; +} + +char *tipc_node_get_id_str(struct tipc_node *node) +{ + return node->peer_id_string; +} + +#ifdef CONFIG_TIPC_CRYPTO +/** + * tipc_node_crypto_rx - Retrieve crypto RX handle from node + * Note: node ref counter must be held first! + */ +struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n) +{ + return (__n) ? __n->crypto_rx : NULL; +} + +struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos) +{ + return container_of(pos, struct tipc_node, list)->crypto_rx; +} +#endif + +static void tipc_node_free(struct rcu_head *rp) +{ + struct tipc_node *n = container_of(rp, struct tipc_node, rcu); + +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_stop(&n->crypto_rx); +#endif + kfree(n); +} + static void tipc_node_kref_release(struct kref *kref) { struct tipc_node *n = container_of(kref, struct tipc_node, kref); kfree(n->bc_entry.link); - kfree_rcu(n, rcu); + call_rcu(&n->rcu, tipc_node_free); } -static void tipc_node_put(struct tipc_node *node) +void tipc_node_put(struct tipc_node *node) { kref_put(&node->kref, tipc_node_kref_release); } @@ -264,7 +317,7 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) rcu_read_lock(); hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) { - if (node->addr != addr) + if (node->addr != addr || node->preliminary) continue; if (!kref_get_unless_zero(&node->kref)) node = NULL; @@ -360,18 +413,71 @@ static void tipc_node_write_unlock(struct tipc_node *n) } } -static struct tipc_node *tipc_node_create(struct net *net, u32 addr, - u8 *peer_id, u16 capabilities) +static void tipc_node_assign_peer_net(struct tipc_node *n, u32 hash_mixes) +{ + int net_id = tipc_netid(n->net); + struct tipc_net *tn_peer; + struct net *tmp; + u32 hash_chk; + + if (n->peer_net) + return; + + for_each_net_rcu(tmp) { + tn_peer = tipc_net(tmp); + if (!tn_peer) + continue; + /* Integrity checking whether node exists in namespace or not */ + if (tn_peer->net_id != net_id) + continue; + if (memcmp(n->peer_id, tn_peer->node_id, NODE_ID_LEN)) + continue; + hash_chk = tipc_net_hash_mixes(tmp, tn_peer->random); + if (hash_mixes ^ hash_chk) + continue; + n->peer_net = tmp; + n->peer_hash_mix = hash_mixes; + break; + } +} + +struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, + u16 capabilities, u32 hash_mixes, + bool preliminary) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n, *temp_node; struct tipc_link *l; + unsigned long intv; int bearer_id; int i; spin_lock_bh(&tn->node_list_lock); - n = tipc_node_find(net, addr); + n = tipc_node_find(net, addr) ?: + tipc_node_find_by_id(net, peer_id); if (n) { + if (!n->preliminary) + goto update; + if (preliminary) + goto exit; + /* A preliminary node becomes "real" now, refresh its data */ + tipc_node_write_lock(n); + n->preliminary = false; + n->addr = addr; + hlist_del_rcu(&n->hash); + hlist_add_head_rcu(&n->hash, + &tn->node_htable[tipc_hashfn(addr)]); + list_del_rcu(&n->list); + list_for_each_entry_rcu(temp_node, &tn->node_list, list) { + if (n->addr < temp_node->addr) + break; + } + list_add_tail_rcu(&n->list, &temp_node->list); + tipc_node_write_unlock_fast(n); + +update: + if (n->peer_hash_mix ^ hash_mixes) + tipc_node_assign_peer_net(n, hash_mixes); if (n->capabilities == capabilities) goto exit; /* Same node may come back with new capabilities */ @@ -389,6 +495,10 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } + + tipc_bcast_toggle_rcast(net, + (tn->capabilities & TIPC_BCAST_RCAST)); + goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); @@ -396,9 +506,23 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, pr_warn("Node creation failed, no memory\n"); goto exit; } + tipc_nodeid2string(n->peer_id_string, peer_id); +#ifdef CONFIG_TIPC_CRYPTO + if (unlikely(tipc_crypto_start(&n->crypto_rx, net, n))) { + pr_warn("Failed to start crypto RX(%s)!\n", n->peer_id_string); + kfree(n); + n = NULL; + goto exit; + } +#endif n->addr = addr; + n->preliminary = preliminary; memcpy(&n->peer_id, peer_id, 16); n->net = net; + n->peer_net = NULL; + n->peer_hash_mix = 0; + /* Assign kernel local namespace if exists */ + tipc_node_assign_peer_net(n, hash_mixes); n->capabilities = capabilities; kref_init(&n->kref); rwlock_init(&n->lock); @@ -417,22 +541,14 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; - if (!tipc_link_bc_create(net, tipc_own_addr(net), - addr, U16_MAX, - tipc_link_window(tipc_bc_sndlink(net)), - n->capabilities, - &n->bc_entry.inputq1, - &n->bc_entry.namedq, - tipc_bc_sndlink(net), - &n->bc_entry.link)) { - pr_warn("Broadcast rcv link creation failed, no memory\n"); - kfree(n); - n = NULL; - goto exit; - } + n->bc_entry.link = NULL; tipc_node_get(n); timer_setup(&n->timer, tipc_node_timeout, 0); - n->keepalive_intv = U32_MAX; + /* Start a slow timer anyway, crypto needs it */ + n->keepalive_intv = 10000; + intv = jiffies + msecs_to_jiffies(n->keepalive_intv); + if (!mod_timer(&n->timer, intv)) + tipc_node_get(n); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) @@ -444,6 +560,7 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } + tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); trace_tipc_node_create(n, true, " "); exit: spin_unlock_bh(&tn->node_list_lock); @@ -617,12 +734,18 @@ static bool tipc_node_cleanup(struct tipc_node *peer) } tipc_node_write_unlock(peer); + if (!deleted) { + spin_unlock_bh(&tn->node_list_lock); + return deleted; + } + /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } - + tipc_bcast_toggle_rcast(peer->net, + (tn->capabilities & TIPC_BCAST_RCAST)); spin_unlock_bh(&tn->node_list_lock); return deleted; } @@ -645,6 +768,10 @@ static void tipc_node_timeout(struct timer_list *t) return; } +#ifdef CONFIG_TIPC_CRYPTO + /* Take any crypto key related actions first */ + tipc_crypto_timeout(n->crypto_rx); +#endif __skb_queue_head_init(&xmitq); /* Initial node interval to value larger (10 seconds), then it will be @@ -665,7 +792,7 @@ static void tipc_node_timeout(struct timer_list *t) remains--; } tipc_node_read_unlock(n); - tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr); + tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr, n); if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } @@ -697,7 +824,7 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, n->link_id = tipc_link_id(nl); /* Leave room for tunnel header when returning 'mtu' to users: */ - n->links[bearer_id].mtu = tipc_link_mtu(nl) - INT_H_SIZE; + n->links[bearer_id].mtu = tipc_link_mss(nl); tipc_bearer_add_dest(n->net, bearer_id, n->addr); tipc_bcast_inc_bearer_dst_cnt(n->net, bearer_id); @@ -751,7 +878,7 @@ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, tipc_node_write_lock(n); __tipc_node_link_up(n, bearer_id, xmitq); maddr = &n->links[bearer_id].maddr; - tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr); + tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr, n); tipc_node_write_unlock(n); } @@ -906,7 +1033,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) if (delete) tipc_mon_remove_peer(n->net, n->addr, old_bearer_id); if (!skb_queue_empty(&xmitq)) - tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); + tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr, n); tipc_sk_rcv(n->net, &le->inputq); } @@ -950,6 +1077,8 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; + bool preliminary; + u32 sugg_addr; /* Suggest new address if some other peer is using this one */ n = tipc_node_find(net, addr); @@ -965,9 +1094,11 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) /* Suggest previously used address if peer is known */ n = tipc_node_find_by_id(net, id); if (n) { - addr = n->addr; + sugg_addr = n->addr; + preliminary = n->preliminary; tipc_node_put(n); - return addr; + if (!preliminary) + return sugg_addr; } /* Even this node may be in conflict */ @@ -979,12 +1110,12 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) void tipc_node_check_dest(struct net *net, u32 addr, u8 *peer_id, struct tipc_bearer *b, - u16 capabilities, u32 signature, + u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr) { struct tipc_node *n; - struct tipc_link *l; + struct tipc_link *l, *snd_l; struct tipc_link_entry *le; bool addr_match = false; bool sign_match = false; @@ -998,11 +1129,28 @@ void tipc_node_check_dest(struct net *net, u32 addr, *dupl_addr = false; *respond = false; - n = tipc_node_create(net, addr, peer_id, capabilities); + n = tipc_node_create(net, addr, peer_id, capabilities, hash_mixes, + false); if (!n) return; tipc_node_write_lock(n); + if (unlikely(!n->bc_entry.link)) { + snd_l = tipc_bc_sndlink(net); + if (!tipc_link_bc_create(net, tipc_own_addr(net), + addr, U16_MAX, + tipc_link_min_win(snd_l), + tipc_link_max_win(snd_l), + n->capabilities, + &n->bc_entry.inputq1, + &n->bc_entry.namedq, snd_l, + &n->bc_entry.link)) { + pr_warn("Broadcast rcv link creation failed, no mem\n"); + tipc_node_write_unlock_fast(n); + tipc_node_put(n); + return; + } + } le = &n->links[b->identity]; @@ -1017,6 +1165,9 @@ void tipc_node_check_dest(struct net *net, u32 addr, if (sign_match && addr_match && link_up) { /* All is fine. Do nothing. */ reset = false; + /* Peer node is not a container/local namespace */ + if (!n->peer_hash_mix) + n->peer_hash_mix = hash_mixes; } else if (sign_match && addr_match && !link_up) { /* Respond. The link will come up in due time */ *respond = true; @@ -1083,7 +1234,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, get_random_bytes(&session, sizeof(u16)); if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, - b->window, session, + b->min_win, b->max_win, session, tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, @@ -1342,7 +1493,8 @@ static void node_lost_contact(struct tipc_node *n, /* Notify publications from this node */ n->action_flags |= TIPC_NOTIFY_NODE_DOWN; - + n->peer_net = NULL; + n->peer_hash_mix = 0; /* Notify sockets connected to node */ list_for_each_entry_safe(conn, safe, conns, list) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, @@ -1424,6 +1576,56 @@ msg_full: return -EMSGSIZE; } +static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) +{ + struct tipc_msg *hdr = buf_msg(skb_peek(list)); + struct sk_buff_head inputq; + + switch (msg_user(hdr)) { + case TIPC_LOW_IMPORTANCE: + case TIPC_MEDIUM_IMPORTANCE: + case TIPC_HIGH_IMPORTANCE: + case TIPC_CRITICAL_IMPORTANCE: + if (msg_connected(hdr) || msg_named(hdr)) { + tipc_loopback_trace(peer_net, list); + spin_lock_init(&list->lock); + tipc_sk_rcv(peer_net, list); + return; + } + if (msg_mcast(hdr)) { + tipc_loopback_trace(peer_net, list); + skb_queue_head_init(&inputq); + tipc_sk_mcast_rcv(peer_net, list, &inputq); + __skb_queue_purge(list); + skb_queue_purge(&inputq); + return; + } + return; + case MSG_FRAGMENTER: + if (tipc_msg_assemble(list)) { + tipc_loopback_trace(peer_net, list); + skb_queue_head_init(&inputq); + tipc_sk_mcast_rcv(peer_net, list, &inputq); + __skb_queue_purge(list); + skb_queue_purge(&inputq); + } + return; + case GROUP_PROTOCOL: + case CONN_MANAGER: + tipc_loopback_trace(peer_net, list); + spin_lock_init(&list->lock); + tipc_sk_rcv(peer_net, list); + return; + case LINK_PROTOCOL: + case NAME_DISTRIBUTOR: + case TUNNEL_PROTOCOL: + case BCAST_PROTOCOL: + return; + default: + return; + }; +} + /** * tipc_node_xmit() is the general link level function for message sending * @net: the applicable net namespace @@ -1439,26 +1641,40 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, struct tipc_link_entry *le = NULL; struct tipc_node *n; struct sk_buff_head xmitq; + bool node_up = false; int bearer_id; int rc; if (in_own_node(net, dnode)) { + tipc_loopback_trace(net, list); + spin_lock_init(&list->lock); tipc_sk_rcv(net, list); return 0; } n = tipc_node_find(net, dnode); if (unlikely(!n)) { - skb_queue_purge(list); + __skb_queue_purge(list); return -EHOSTUNREACH; } tipc_node_read_lock(n); + node_up = node_is_up(n); + if (node_up && n->peer_net && check_net(n->peer_net)) { + /* xmit inner linux container */ + tipc_lxc_xmit(n->peer_net, list); + if (likely(skb_queue_empty(list))) { + tipc_node_read_unlock(n); + tipc_node_put(n); + return 0; + } + } + bearer_id = n->active_links[selector & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) { tipc_node_read_unlock(n); tipc_node_put(n); - skb_queue_purge(list); + __skb_queue_purge(list); return -EHOSTUNREACH; } @@ -1472,7 +1688,7 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, if (unlikely(rc == -ENOBUFS)) tipc_node_link_down(n, bearer_id, false); else - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); tipc_node_put(n); @@ -1490,7 +1706,7 @@ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, { struct sk_buff_head head; - skb_queue_head_init(&head); + __skb_queue_head_init(&head); __skb_queue_tail(&head, skb); tipc_node_xmit(net, &head, dnode, selector); return 0; @@ -1620,7 +1836,7 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id } if (!skb_queue_empty(&xmitq)) - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); if (!skb_queue_empty(&be->inputq1)) tipc_node_mcast_rcv(n); @@ -1649,7 +1865,6 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, int usr = msg_user(hdr); int mtyp = msg_type(hdr); u16 oseqno = msg_seqno(hdr); - u16 iseqno = msg_seqno(msg_inner_hdr(hdr)); u16 exp_pkts = msg_msgcnt(hdr); u16 rcv_nxt, syncpt, dlv_nxt, inputq_len; int state = n->state; @@ -1748,7 +1963,10 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, /* Initiate synch mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG) && (oseqno == 1)) { - syncpt = iseqno + exp_pkts - 1; + if (n->capabilities & TIPC_TUNNEL_ENHANCED) + syncpt = msg_syncpt(hdr); + else + syncpt = msg_seqno(msg_inner_hdr(hdr)) + exp_pkts - 1; if (!tipc_link_is_up(l)) __tipc_node_link_up(n, bearer_id, xmitq); if (n->state == SELF_UP_PEER_UP) { @@ -1796,20 +2014,38 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct sk_buff_head xmitq; - struct tipc_node *n; + struct tipc_link_entry *le; struct tipc_msg *hdr; + struct tipc_node *n; int bearer_id = b->identity; - struct tipc_link_entry *le; u32 self = tipc_own_addr(net); int usr, rc = 0; u16 bc_ack; +#ifdef CONFIG_TIPC_CRYPTO + struct tipc_ehdr *ehdr; - __skb_queue_head_init(&xmitq); + /* Check if message must be decrypted first */ + if (TIPC_SKB_CB(skb)->decrypted || !tipc_ehdr_validate(skb)) + goto rcv; + + ehdr = (struct tipc_ehdr *)skb->data; + if (likely(ehdr->user != LINK_CONFIG)) { + n = tipc_node_find(net, ntohl(ehdr->addr)); + if (unlikely(!n)) + goto discard; + } else { + n = tipc_node_find_by_id(net, ehdr->id); + } + tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b); + if (!skb) + return; +rcv: +#endif /* Ensure message is well-formed before touching the header */ - TIPC_SKB_CB(skb)->validated = false; if (unlikely(!tipc_msg_validate(&skb))) goto discard; + __skb_queue_head_init(&xmitq); hdr = buf_msg(skb); usr = msg_user(hdr); bc_ack = msg_bcast_ack(hdr); @@ -1880,7 +2116,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) tipc_sk_rcv(net, &le->inputq); if (!skb_queue_empty(&xmitq)) - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); tipc_node_put(n); discard: @@ -1911,7 +2147,7 @@ void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, tipc_link_set_mtu(e->link, b->mtu); } tipc_node_write_unlock(n); - tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr, NULL); } rcu_read_unlock(); @@ -1922,7 +2158,7 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; - struct tipc_node *peer; + struct tipc_node *peer, *temp_node; u32 addr; int err; @@ -1963,6 +2199,12 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) tipc_node_write_unlock(peer); tipc_node_delete(peer); + /* Calculate cluster capabilities */ + tn->capabilities = TIPC_NODE_CAPABILITIES; + list_for_each_entry_rcu(temp_node, &tn->node_list, list) { + tn->capabilities &= temp_node->capabilities; + } + tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); err = 0; err_out: tipc_node_put(peer); @@ -2007,6 +2249,8 @@ int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) } list_for_each_entry_rcu(node, &tn->node_list, list) { + if (node->preliminary) + continue; if (last_addr) { if (node->addr == last_addr) last_addr = 0; @@ -2117,8 +2361,7 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) if (attrs[TIPC_NLA_LINK_PROP]) { struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; - err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], - props); + err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props); if (err) { res = err; goto out; @@ -2137,16 +2380,19 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) tipc_link_set_prio(link, prio, &xmitq); } if (props[TIPC_NLA_PROP_WIN]) { - u32 win; + u32 max_win; - win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); - tipc_link_set_queue_limits(link, win); + max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + tipc_link_set_queue_limits(link, + tipc_link_min_win(link), + max_win); } } out: tipc_node_read_unlock(node); - tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr, + NULL); return res; } @@ -2480,13 +2726,9 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, int err; if (!prev_node) { - struct nlattr **attrs; + struct nlattr **attrs = genl_dumpit_info(cb)->attrs; struct nlattr *mon[TIPC_NLA_MON_MAX + 1]; - err = tipc_nlmsg_parse(cb->nlh, &attrs); - if (err) - return err; - if (!attrs[TIPC_NLA_MON]) return -EINVAL; @@ -2526,11 +2768,142 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, return skb->len; } -u32 tipc_node_get_addr(struct tipc_node *node) +#ifdef CONFIG_TIPC_CRYPTO +static int tipc_nl_retrieve_key(struct nlattr **attrs, + struct tipc_aead_key **key) { - return (node) ? node->addr : 0; + struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY]; + + if (!attr) + return -ENODATA; + + *key = (struct tipc_aead_key *)nla_data(attr); + if (nla_len(attr) < tipc_aead_key_size(*key)) + return -EINVAL; + + return 0; +} + +static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id) +{ + struct nlattr *attr = attrs[TIPC_NLA_NODE_ID]; + + if (!attr) + return -ENODATA; + + if (nla_len(attr) < TIPC_NODEID_LEN) + return -EINVAL; + + *node_id = (u8 *)nla_data(attr); + return 0; +} + +static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1]; + struct net *net = sock_net(skb->sk); + struct tipc_net *tn = tipc_net(net); + struct tipc_node *n = NULL; + struct tipc_aead_key *ukey; + struct tipc_crypto *c; + u8 *id, *own_id; + int rc = 0; + + if (!info->attrs[TIPC_NLA_NODE]) + return -EINVAL; + + rc = nla_parse_nested(attrs, TIPC_NLA_NODE_MAX, + info->attrs[TIPC_NLA_NODE], + tipc_nl_node_policy, info->extack); + if (rc) + goto exit; + + own_id = tipc_own_id(net); + if (!own_id) { + rc = -EPERM; + goto exit; + } + + rc = tipc_nl_retrieve_key(attrs, &ukey); + if (rc) + goto exit; + + rc = tipc_aead_key_validate(ukey); + if (rc) + goto exit; + + rc = tipc_nl_retrieve_nodeid(attrs, &id); + switch (rc) { + case -ENODATA: + /* Cluster key mode */ + rc = tipc_crypto_key_init(tn->crypto_tx, ukey, CLUSTER_KEY); + break; + case 0: + /* Per-node key mode */ + if (!memcmp(id, own_id, NODE_ID_LEN)) { + c = tn->crypto_tx; + } else { + n = tipc_node_find_by_id(net, id) ?: + tipc_node_create(net, 0, id, 0xffffu, 0, true); + if (unlikely(!n)) { + rc = -ENOMEM; + break; + } + c = n->crypto_rx; + } + + rc = tipc_crypto_key_init(c, ukey, PER_NODE_KEY); + if (n) + tipc_node_put(n); + break; + default: + break; + } + +exit: + return (rc < 0) ? rc : 0; +} + +int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) +{ + int err; + + rtnl_lock(); + err = __tipc_nl_node_set_key(skb, info); + rtnl_unlock(); + + return err; +} + +static int __tipc_nl_node_flush_key(struct sk_buff *skb, + struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct tipc_net *tn = tipc_net(net); + struct tipc_node *n; + + tipc_crypto_key_flush(tn->crypto_tx); + rcu_read_lock(); + list_for_each_entry_rcu(n, &tn->node_list, list) + tipc_crypto_key_flush(n->crypto_rx); + rcu_read_unlock(); + + pr_info("All keys are flushed!\n"); + return 0; } +int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) +{ + int err; + + rtnl_lock(); + err = __tipc_nl_node_flush_key(skb, info); + rtnl_unlock(); + + return err; +} +#endif + /** * tipc_node_dump - dump TIPC node data * @n: tipc node to be dumped @@ -2587,3 +2960,33 @@ int tipc_node_dump(struct tipc_node *n, bool more, char *buf) return i; } + +void tipc_node_pre_cleanup_net(struct net *exit_net) +{ + struct tipc_node *n; + struct tipc_net *tn; + struct net *tmp; + + rcu_read_lock(); + for_each_net_rcu(tmp) { + if (tmp == exit_net) + continue; + tn = tipc_net(tmp); + if (!tn) + continue; + spin_lock_bh(&tn->node_list_lock); + list_for_each_entry_rcu(n, &tn->node_list, list) { + if (!n->peer_net) + continue; + if (n->peer_net != exit_net) + continue; + tipc_node_write_lock(n); + n->peer_net = NULL; + n->peer_hash_mix = 0; + tipc_node_write_unlock_fast(n); + break; + } + spin_unlock_bh(&tn->node_list_lock); + } + rcu_read_unlock(); +} diff --git a/net/tipc/node.h b/net/tipc/node.h index c0bf49ea3de4..a6803b449a2c 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -53,7 +53,9 @@ enum { TIPC_NODE_ID128 = (1 << 5), TIPC_LINK_PROTO_SEQNO = (1 << 6), TIPC_MCAST_RBCTL = (1 << 7), - TIPC_GAP_ACK_BLOCK = (1 << 8) + TIPC_GAP_ACK_BLOCK = (1 << 8), + TIPC_TUNNEL_ENHANCED = (1 << 9), + TIPC_NAGLE = (1 << 10) }; #define TIPC_NODE_CAPABILITIES (TIPC_SYN_BIT | \ @@ -64,16 +66,28 @@ enum { TIPC_NODE_ID128 | \ TIPC_LINK_PROTO_SEQNO | \ TIPC_MCAST_RBCTL | \ - TIPC_GAP_ACK_BLOCK) + TIPC_GAP_ACK_BLOCK | \ + TIPC_TUNNEL_ENHANCED | \ + TIPC_NAGLE) + #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); bool tipc_node_get_id(struct net *net, u32 addr, u8 *id); u32 tipc_node_get_addr(struct tipc_node *node); +char *tipc_node_get_id_str(struct tipc_node *node); +void tipc_node_put(struct tipc_node *node); +struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, + u16 capabilities, u32 hash_mixes, + bool preliminary); +#ifdef CONFIG_TIPC_CRYPTO +struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n); +struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos); +#endif u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr); void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128, struct tipc_bearer *bearer, - u16 capabilities, u32 signature, + u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr); void tipc_node_delete_links(struct net *net, int bearer_id); @@ -90,7 +104,7 @@ void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr); void tipc_node_broadcast(struct net *net, struct sk_buff *skb); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); -int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel); +int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected); bool tipc_node_is_up(struct net *net, u32 addr); u16 tipc_node_get_capabilities(struct net *net, u32 addr); int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb); @@ -105,4 +119,9 @@ int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, struct netlink_callback *cb); +#ifdef CONFIG_TIPC_CRYPTO +int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info); +#endif +void tipc_node_pre_cleanup_net(struct net *exit_net); #endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 83ae41d7e554..693e8902161e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -75,6 +75,7 @@ struct sockaddr_pair { * @conn_instance: TIPC instance used when connection was established * @published: non-zero if port has one or more associated names * @max_pkt: maximum packet size "hint" used when building messages sent by port + * @maxnagle: maximum size of msg which can be subject to nagle * @portid: unique port identity in TIPC socket hash table * @phdr: preformatted message header used when sending messages * #cong_links: list of congested links @@ -97,6 +98,7 @@ struct tipc_sock { u32 conn_instance; int published; u32 max_pkt; + u32 maxnagle; u32 portid; struct tipc_msg phdr; struct list_head cong_links; @@ -116,6 +118,10 @@ struct tipc_sock { struct tipc_mc_method mc_method; struct rcu_head rcu; struct tipc_group *group; + u32 oneway; + u16 snd_backlog; + bool expect_ack; + bool nodelay; bool group_is_open; }; @@ -137,6 +143,7 @@ static int tipc_sk_insert(struct tipc_sock *tsk); static void tipc_sk_remove(struct tipc_sock *tsk); static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz); static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); +static void tipc_sk_push_backlog(struct tipc_sock *tsk); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; @@ -227,6 +234,26 @@ static u16 tsk_inc(struct tipc_sock *tsk, int msglen) return 1; } +/* tsk_set_nagle - enable/disable nagle property by manipulating maxnagle + */ +static void tsk_set_nagle(struct tipc_sock *tsk) +{ + struct sock *sk = &tsk->sk; + + tsk->maxnagle = 0; + if (sk->sk_type != SOCK_STREAM) + return; + if (tsk->nodelay) + return; + if (!(tsk->peer_caps & TIPC_NAGLE)) + return; + /* Limit node local buffer size to avoid receive queue overflow */ + if (tsk->max_pkt == MAX_MSG_SIZE) + tsk->maxnagle = 1500; + else + tsk->maxnagle = tsk->max_pkt; +} + /** * tsk_advance_rx_queue - discard first buffer in socket receive queue * @@ -260,12 +287,12 @@ static void tipc_sk_respond(struct sock *sk, struct sk_buff *skb, int err) * * Caller must hold socket lock */ -static void tsk_rej_rx_queue(struct sock *sk) +static void tsk_rej_rx_queue(struct sock *sk, int error) { struct sk_buff *skb; while ((skb = __skb_dequeue(&sk->sk_receive_queue))) - tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT); + tipc_sk_respond(sk, skb, error); } static bool tipc_sk_connected(struct sock *sk) @@ -446,6 +473,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, tsk = tipc_sk(sk); tsk->max_pkt = MAX_PKT_DEFAULT; + tsk->maxnagle = 0; INIT_LIST_HEAD(&tsk->publications); INIT_LIST_HEAD(&tsk->cong_links); msg = &tsk->phdr; @@ -504,7 +532,7 @@ static void __tipc_shutdown(struct socket *sock, int error) struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); struct net *net = sock_net(sk); - long timeout = CONN_TIMEOUT_DEFAULT; + long timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT); u32 dnode = tsk_peer_node(tsk); struct sk_buff *skb; @@ -512,37 +540,50 @@ static void __tipc_shutdown(struct socket *sock, int error) tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))); - /* Remove any pending SYN message */ + /* Push out delayed messages if in Nagle mode */ + tipc_sk_push_backlog(tsk); + /* Remove pending SYN */ __skb_queue_purge(&sk->sk_write_queue); - /* Reject all unreceived messages, except on an active connection - * (which disconnects locally & sends a 'FIN+' to peer). - */ - while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { - if (TIPC_SKB_CB(skb)->bytes_read) { - kfree_skb(skb); - continue; - } - if (!tipc_sk_type_connectionless(sk) && - sk->sk_state != TIPC_DISCONNECTING) { - tipc_set_sk_state(sk, TIPC_DISCONNECTING); - tipc_node_remove_conn(net, dnode, tsk->portid); - } - tipc_sk_respond(sk, skb, error); + /* Remove partially received buffer if any */ + skb = skb_peek(&sk->sk_receive_queue); + if (skb && TIPC_SKB_CB(skb)->bytes_read) { + __skb_unlink(skb, &sk->sk_receive_queue); + kfree_skb(skb); } - if (tipc_sk_type_connectionless(sk)) + /* Reject all unreceived messages if connectionless */ + if (tipc_sk_type_connectionless(sk)) { + tsk_rej_rx_queue(sk, error); return; + } - if (sk->sk_state != TIPC_DISCONNECTING) { + switch (sk->sk_state) { + case TIPC_CONNECTING: + case TIPC_ESTABLISHED: + tipc_set_sk_state(sk, TIPC_DISCONNECTING); + tipc_node_remove_conn(net, dnode, tsk->portid); + /* Send a FIN+/- to its peer */ + skb = __skb_dequeue(&sk->sk_receive_queue); + if (skb) { + __skb_queue_purge(&sk->sk_receive_queue); + tipc_sk_respond(sk, skb, error); + break; + } skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode, tsk_own_node(tsk), tsk_peer_port(tsk), tsk->portid, error); if (skb) tipc_node_xmit_skb(net, skb, dnode, tsk->portid); - tipc_node_remove_conn(net, dnode, tsk->portid); - tipc_set_sk_state(sk, TIPC_DISCONNECTING); + break; + case TIPC_LISTEN: + /* Reject all SYN messages */ + tsk_rej_rx_queue(sk, error); + break; + default: + __skb_queue_purge(&sk->sk_receive_queue); + break; } } @@ -740,7 +781,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, /* fall through */ case TIPC_LISTEN: case TIPC_CONNECTING: - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) revents |= EPOLLIN | EPOLLRDNORM; break; case TIPC_OPEN: @@ -748,7 +789,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, revents |= EPOLLOUT; if (!tipc_sk_type_connectionless(sk)) break; - if (skb_queue_empty(&sk->sk_receive_queue)) + if (skb_queue_empty_lockless(&sk->sk_receive_queue)) break; revents |= EPOLLIN | EPOLLRDNORM; break; @@ -809,7 +850,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, msg_set_nameupper(hdr, seq->upper); /* Build message as chain of buffers */ - skb_queue_head_init(&pkts); + __skb_queue_head_init(&pkts); rc = tipc_msg_build(hdr, msg, 0, dlen, mtu, &pkts); /* Send message if build was successful */ @@ -853,8 +894,8 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, msg_set_grp_bc_seqno(hdr, bc_snd_nxt); /* Build message as chain of buffers */ - skb_queue_head_init(&pkts); - mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + __skb_queue_head_init(&pkts); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; @@ -1058,7 +1099,7 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, msg_set_grp_bc_ack_req(hdr, ack); /* Build message as chain of buffers */ - skb_queue_head_init(&pkts); + __skb_queue_head_init(&pkts); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; @@ -1208,6 +1249,32 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, tipc_sk_rcv(net, inputq); } +/* tipc_sk_push_backlog(): send accumulated buffers in socket write queue + * when socket is in Nagle mode + */ +static void tipc_sk_push_backlog(struct tipc_sock *tsk) +{ + struct sk_buff_head *txq = &tsk->sk.sk_write_queue; + struct net *net = sock_net(&tsk->sk); + u32 dnode = tsk_peer_node(tsk); + struct sk_buff *skb = skb_peek(txq); + int rc; + + if (!skb || tsk->cong_link_cnt) + return; + + /* Do not send SYN again after congestion */ + if (msg_is_syn(buf_msg(skb))) + return; + + tsk->snt_unacked += tsk->snd_backlog; + tsk->snd_backlog = 0; + tsk->expect_ack = true; + rc = tipc_node_xmit(net, txq, dnode, tsk->portid); + if (rc == -ELINKCONG) + tsk->cong_link_cnt = 1; +} + /** * tipc_sk_conn_proto_rcv - receive a connection mng protocol message * @tsk: receiving socket @@ -1221,7 +1288,7 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, u32 onode = tsk_own_node(tsk); struct sock *sk = &tsk->sk; int mtyp = msg_type(hdr); - bool conn_cong; + bool was_cong; /* Ignore if connection cannot be validated: */ if (!tsk_peer_msg(tsk, hdr)) { @@ -1254,11 +1321,13 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, __skb_queue_tail(xmitq, skb); return; } else if (mtyp == CONN_ACK) { - conn_cong = tsk_conn_cong(tsk); + was_cong = tsk_conn_cong(tsk); + tsk->expect_ack = false; + tipc_sk_push_backlog(tsk); tsk->snt_unacked -= msg_conn_ack(hdr); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) tsk->snd_win = msg_adv_win(hdr); - if (conn_cong) + if (was_cong && !tsk_conn_cong(tsk)) sk->sk_write_space(sk); } else if (mtyp != CONN_PROBE_REPLY) { pr_warn("Received unknown CONN_PROTO msg\n"); @@ -1306,8 +1375,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq *seq; struct sk_buff_head pkts; - u32 dport, dnode = 0; - u32 type, inst; + u32 dport = 0, dnode = 0; + u32 type = 0, inst = 0; int mtu, rc; if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) @@ -1360,23 +1429,11 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) type = dest->addr.name.name.type; inst = dest->addr.name.name.instance; dnode = dest->addr.name.domain; - msg_set_type(hdr, TIPC_NAMED_MSG); - msg_set_hdr_sz(hdr, NAMED_H_SIZE); - msg_set_nametype(hdr, type); - msg_set_nameinst(hdr, inst); - msg_set_lookup_scope(hdr, tipc_node2scope(dnode)); dport = tipc_nametbl_translate(net, type, inst, &dnode); - msg_set_destnode(hdr, dnode); - msg_set_destport(hdr, dport); if (unlikely(!dport && !dnode)) return -EHOSTUNREACH; } else if (dest->addrtype == TIPC_ADDR_ID) { dnode = dest->addr.id.node; - msg_set_type(hdr, TIPC_DIRECT_MSG); - msg_set_lookup_scope(hdr, 0); - msg_set_destnode(hdr, dnode); - msg_set_destport(hdr, dest->addr.id.ref); - msg_set_hdr_sz(hdr, BASIC_H_SIZE); } else { return -EINVAL; } @@ -1387,13 +1444,31 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (unlikely(rc)) return rc; - skb_queue_head_init(&pkts); - mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + if (dest->addrtype == TIPC_ADDR_NAME) { + msg_set_type(hdr, TIPC_NAMED_MSG); + msg_set_hdr_sz(hdr, NAMED_H_SIZE); + msg_set_nametype(hdr, type); + msg_set_nameinst(hdr, inst); + msg_set_lookup_scope(hdr, tipc_node2scope(dnode)); + msg_set_destnode(hdr, dnode); + msg_set_destport(hdr, dport); + } else { /* TIPC_ADDR_ID */ + msg_set_type(hdr, TIPC_DIRECT_MSG); + msg_set_lookup_scope(hdr, 0); + msg_set_destnode(hdr, dnode); + msg_set_destport(hdr, dest->addr.id.ref); + msg_set_hdr_sz(hdr, BASIC_H_SIZE); + } + + __skb_queue_head_init(&pkts); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; - if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue))) + if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue))) { + __skb_queue_purge(&pkts); return -ENOMEM; + } trace_tipc_sk_sendmsg(sk, skb_peek(&pkts), TIPC_DUMP_SK_SNDQ, " "); rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); @@ -1437,15 +1512,15 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); + struct sk_buff_head *txq = &sk->sk_write_queue; struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); - struct sk_buff_head pkts; u32 dnode = tsk_peer_node(tsk); + int maxnagle = tsk->maxnagle; + int maxpkt = tsk->max_pkt; int send, sent = 0; - int rc = 0; - - skb_queue_head_init(&pkts); + int blocks, rc = 0; if (unlikely(dlen > INT_MAX)) return -EMSGSIZE; @@ -1467,21 +1542,35 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) tipc_sk_connected(sk))); if (unlikely(rc)) break; - send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE); - rc = tipc_msg_build(hdr, m, sent, send, tsk->max_pkt, &pkts); - if (unlikely(rc != send)) - break; - - trace_tipc_sk_sendstream(sk, skb_peek(&pkts), + blocks = tsk->snd_backlog; + if (tsk->oneway++ >= 4 && send <= maxnagle) { + rc = tipc_msg_append(hdr, m, send, maxnagle, txq); + if (unlikely(rc < 0)) + break; + blocks += rc; + if (blocks <= 64 && tsk->expect_ack) { + tsk->snd_backlog = blocks; + sent += send; + break; + } + tsk->expect_ack = true; + } else { + rc = tipc_msg_build(hdr, m, sent, send, maxpkt, txq); + if (unlikely(rc != send)) + break; + blocks += tsk_inc(tsk, send + MIN_H_SIZE); + } + trace_tipc_sk_sendstream(sk, skb_peek(txq), TIPC_DUMP_SK_SNDQ, " "); - rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); + rc = tipc_node_xmit(net, txq, dnode, tsk->portid); if (unlikely(rc == -ELINKCONG)) { tsk->cong_link_cnt = 1; rc = 0; } if (likely(!rc)) { - tsk->snt_unacked += tsk_inc(tsk, send + MIN_H_SIZE); + tsk->snt_unacked += blocks; + tsk->snd_backlog = 0; sent += send; } } while (sent < dlen && !rc); @@ -1526,8 +1615,9 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV); tipc_set_sk_state(sk, TIPC_ESTABLISHED); tipc_node_add_conn(net, peer_node, tsk->portid, peer_port); - tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid); + tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid, true); tsk->peer_caps = tipc_node_get_capabilities(net, peer_node); + tsk_set_nagle(tsk); __skb_queue_purge(&sk->sk_write_queue); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) return; @@ -1805,7 +1895,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, /* Send group flow control advertisement when applicable */ if (tsk->group && msg_in_group(hdr) && !grp_evt) { - skb_queue_head_init(&xmitq); + __skb_queue_head_init(&xmitq); tipc_group_update_rcv_win(tsk->group, tsk_blocks(hlen + dlen), msg_orignode(hdr), msg_origport(hdr), &xmitq); @@ -1848,6 +1938,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, bool peek = flags & MSG_PEEK; int offset, required, copy, copied = 0; int hlen, dlen, err, rc; + bool ack = false; long timeout; /* Catch invalid receive attempts */ @@ -1892,6 +1983,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Copy data if msg ok, otherwise return error/partial data */ if (likely(!err)) { + ack = msg_ack_required(hdr); offset = skb_cb->bytes_read; copy = min_t(int, dlen - offset, buflen - copied); rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy); @@ -1919,7 +2011,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Send connection flow control advertisement when applicable */ tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen); - if (unlikely(tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE)) + if (ack || tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE) tipc_sk_send_ack(tsk); /* Exit if all requested data or FIN/error received */ @@ -1990,6 +2082,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, smp_wmb(); tsk->cong_link_cnt--; wakeup = true; + tipc_sk_push_backlog(tsk); break; case GROUP_PROTOCOL: tipc_group_proto_rcv(grp, &wakeup, hdr, inputq, xmitq); @@ -2029,6 +2122,7 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) if (unlikely(msg_mcast(hdr))) return false; + tsk->oneway = 0; switch (sk->sk_state) { case TIPC_CONNECTING: @@ -2074,6 +2168,8 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) return true; return false; case TIPC_ESTABLISHED: + if (!skb_queue_empty(&sk->sk_write_queue)) + tipc_sk_push_backlog(tsk); /* Accept only connection-based messages sent by peer */ if (likely(con_msg && !err && pport == oport && pnode == onode)) return true; @@ -2119,13 +2215,13 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) struct tipc_msg *hdr = buf_msg(skb); if (unlikely(msg_in_group(hdr))) - return sk->sk_rcvbuf; + return READ_ONCE(sk->sk_rcvbuf); if (unlikely(!msg_connected(hdr))) - return sk->sk_rcvbuf << msg_importance(hdr); + return READ_ONCE(sk->sk_rcvbuf) << msg_importance(hdr); if (likely(tsk->peer_caps & TIPC_BLOCK_FLOWCTL)) - return sk->sk_rcvbuf; + return READ_ONCE(sk->sk_rcvbuf); return FLOWCTL_MSG_LIM; } @@ -2345,10 +2441,12 @@ static int tipc_wait_for_connect(struct socket *sock, long *timeo_p) return -ETIMEDOUT; if (signal_pending(current)) return sock_intr_errno(*timeo_p); + if (sk->sk_state == TIPC_DISCONNECTING) + break; add_wait_queue(sk_sleep(sk), &wait); - done = sk_wait_event(sk, timeo_p, - sk->sk_state != TIPC_CONNECTING, &wait); + done = sk_wait_event(sk, timeo_p, tipc_sk_connected(sk), + &wait); remove_wait_queue(sk_sleep(sk), &wait); } while (!done); return 0; @@ -2558,7 +2656,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, * Reject any stray messages received by new socket * before the socket lock was taken (very, very unlikely) */ - tsk_rej_rx_queue(new_sk); + tsk_rej_rx_queue(new_sk, TIPC_ERR_NO_PORT); /* Connect new socket to it's peer */ tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg)); @@ -2674,13 +2772,14 @@ static void tipc_sk_timeout(struct timer_list *t) struct sk_buff_head list; int rc = 0; - skb_queue_head_init(&list); + __skb_queue_head_init(&list); bh_lock_sock(sk); /* Try again later if socket is busy */ if (sock_owned_by_user(sk)) { sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20); bh_unlock_sock(sk); + sock_put(sk); return; } @@ -2804,7 +2903,7 @@ static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid) struct tipc_sock *tsk; rcu_read_lock(); - tsk = rhashtable_lookup_fast(&tn->sk_rht, &portid, tsk_rht_params); + tsk = rhashtable_lookup(&tn->sk_rht, &portid, tsk_rht_params); if (tsk) sock_hold(&tsk->sk); rcu_read_unlock(); @@ -2959,6 +3058,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_SRC_DROPPABLE: case TIPC_DEST_DROPPABLE: case TIPC_CONN_TIMEOUT: + case TIPC_NODELAY: if (ol < sizeof(value)) return -EINVAL; if (get_user(value, (u32 __user *)ov)) @@ -3007,6 +3107,10 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_GROUP_LEAVE: res = tipc_sk_leave(tsk); break; + case TIPC_NODELAY: + tsk->nodelay = !!value; + tsk_set_nagle(tsk); + break; default: res = -EINVAL; } @@ -3588,13 +3692,9 @@ int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb) struct tipc_sock *tsk; if (!tsk_portid) { - struct nlattr **attrs; + struct nlattr **attrs = genl_dumpit_info(cb)->attrs; struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1]; - err = tipc_nlmsg_parse(cb->nlh, &attrs); - if (err) - return err; - if (!attrs[TIPC_NLA_SOCK]) return -EINVAL; @@ -3790,7 +3890,7 @@ int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf) i += scnprintf(buf + i, sz - i, " %d", sk->sk_sndbuf); i += scnprintf(buf + i, sz - i, " | %d", sk_rmem_alloc_get(sk)); i += scnprintf(buf + i, sz - i, " %d", sk->sk_rcvbuf); - i += scnprintf(buf + i, sz - i, " | %d\n", sk->sk_backlog.len); + i += scnprintf(buf + i, sz - i, " | %d\n", READ_ONCE(sk->sk_backlog.len)); if (dqueues & TIPC_DUMP_SK_SNDQ) { i += scnprintf(buf + i, sz - i, "sk_write_queue: "); diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index 6159d327db76..58ab3d6dcdce 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -35,6 +35,7 @@ #include "core.h" #include "trace.h" +#include "crypto.h" #include <linux/sysctl.h> @@ -64,6 +65,16 @@ static struct ctl_table tipc_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, +#ifdef CONFIG_TIPC_CRYPTO + { + .procname = "max_tfms", + .data = &sysctl_tipc_max_tfms, + .maxlen = sizeof(sysctl_tipc_max_tfms), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, +#endif {} }; diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index ca8ac96d22a9..3a12fc18239b 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -40,6 +40,7 @@ #include "socket.h" #include "addr.h" #include "msg.h" +#include "bearer.h" #include <net/sock.h> #include <linux/module.h> @@ -608,6 +609,7 @@ static void tipc_topsrv_kern_evt(struct net *net, struct tipc_event *evt) memcpy(msg_data(buf_msg(skb)), evt, sizeof(*evt)); skb_queue_head_init(&evtq); __skb_queue_tail(&evtq, skb); + tipc_loopback_trace(net, &evtq); tipc_sk_rcv(net, &evtq); } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 287df68721df..d6620ad53546 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -195,10 +195,13 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, .saddr = src->ipv6, .flowi6_proto = IPPROTO_UDP }; - err = ipv6_stub->ipv6_dst_lookup(net, ub->ubsock->sk, - &ndst, &fl6); - if (err) + ndst = ipv6_stub->ipv6_dst_lookup_flow(net, + ub->ubsock->sk, + &fl6, NULL); + if (IS_ERR(ndst)) { + err = PTR_ERR(ndst); goto tx_error; + } dst_cache_set_ip6(cache, ndst, &fl6.saddr); } ttl = ip6_dst_hoplimit(ndst); @@ -372,6 +375,7 @@ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) goto out; if (b && test_bit(0, &b->up)) { + TIPC_SKB_CB(skb)->flags = 0; tipc_rcv(sock_net(sk), skb, b); return 0; } @@ -448,15 +452,11 @@ int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) int i; if (!bid && !skip_cnt) { + struct nlattr **attrs = genl_dumpit_info(cb)->attrs; struct net *net = sock_net(skb->sk); struct nlattr *battrs[TIPC_NLA_BEARER_MAX + 1]; - struct nlattr **attrs; char *bname; - err = tipc_nlmsg_parse(cb->nlh, &attrs); - if (err) - return err; - if (!attrs[TIPC_NLA_BEARER]) return -EINVAL; @@ -828,7 +828,8 @@ struct tipc_media udp_media_info = { .msg2addr = tipc_udp_msg2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_DEF_LINK_WIN, + .max_win = TIPC_DEF_LINK_WIN, .mtu = TIPC_DEF_LINK_UDP_MTU, .type_id = TIPC_MEDIA_TYPE_UDP, .hwaddr_len = 0, diff --git a/net/tls/Kconfig b/net/tls/Kconfig index e4328b3b72eb..61ec78521a60 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -26,3 +26,13 @@ config TLS_DEVICE Enable kernel support for HW offload of the TLS protocol. If unsure, say N. + +config TLS_TOE + bool "Transport Layer Security TCP stack bypass" + depends on TLS + default n + help + Enable kernel support for legacy HW offload of the TLS protocol, + which is incompatible with the Linux networking stack semantics. + + If unsure, say N. diff --git a/net/tls/Makefile b/net/tls/Makefile index ef0dc74ce8f9..f1ffbfe8968d 100644 --- a/net/tls/Makefile +++ b/net/tls/Makefile @@ -3,8 +3,11 @@ # Makefile for the TLS subsystem. # +CFLAGS_trace.o := -I$(src) + obj-$(CONFIG_TLS) += tls.o -tls-y := tls_main.o tls_sw.o +tls-y := tls_main.o tls_sw.o tls_proc.o trace.o +tls-$(CONFIG_TLS_TOE) += tls_toe.o tls-$(CONFIG_TLS_DEVICE) += tls_device.o tls_device_fallback.o diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 43922d86e510..1ba5a92832bb 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -38,6 +38,8 @@ #include <net/tcp.h> #include <net/tls.h> +#include "trace.h" + /* device_offload_lock is used to synchronize tls_dev_add * against NETDEV_DOWN notifications. */ @@ -61,7 +63,7 @@ static void tls_device_free_ctx(struct tls_context *ctx) if (ctx->rx_conf == TLS_HW) kfree(tls_offload_ctx_rx(ctx)); - tls_ctx_free(ctx); + tls_ctx_free(NULL, ctx); } static void tls_device_gc_task(struct work_struct *work) @@ -122,13 +124,10 @@ static struct net_device *get_netdev_for_sock(struct sock *sk) static void destroy_record(struct tls_record_info *record) { - int nr_frags = record->num_frags; - skb_frag_t *frag; + int i; - while (nr_frags-- > 0) { - frag = &record->frags[nr_frags]; - __skb_frag_unref(frag); - } + for (i = 0; i < record->num_frags; i++) + __skb_frag_unref(&record->frags[i]); kfree(record); } @@ -159,12 +158,8 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) spin_lock_irqsave(&ctx->lock, flags); info = ctx->retransmit_hint; - if (info && !before(acked_seq, info->end_seq)) { + if (info && !before(acked_seq, info->end_seq)) ctx->retransmit_hint = NULL; - list_del(&info->list); - destroy_record(info); - deleted_records++; - } list_for_each_entry_safe(info, temp, &ctx->records_list, list) { if (before(acked_seq, info->end_seq)) @@ -183,7 +178,7 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) * socket and no in-flight SKBs associated with this * socket, so it is safe to free all the resources. */ -static void tls_device_sk_destruct(struct sock *sk) +void tls_device_sk_destruct(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); @@ -201,6 +196,7 @@ static void tls_device_sk_destruct(struct sock *sk) if (refcount_dec_and_test(&tls_ctx->refcount)) tls_device_queue_ctx_destruction(tls_ctx); } +EXPORT_SYMBOL_GPL(tls_device_sk_destruct); void tls_device_free_resources_tx(struct sock *sk) { @@ -209,6 +205,15 @@ void tls_device_free_resources_tx(struct sock *sk) tls_free_partial_record(sk, tls_ctx); } +void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + + trace_tls_device_tx_resync_req(sk, got_seq, exp_seq); + WARN_ON(test_and_set_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags)); +} +EXPORT_SYMBOL_GPL(tls_offload_tx_resync_request); + static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx, u32 seq) { @@ -223,6 +228,7 @@ static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx, rcd_sn = tls_ctx->tx.rec_seq; + trace_tls_device_tx_resync_send(sk, seq, rcd_sn); down_read(&device_offload_lock); netdev = tls_ctx->netdev; if (netdev) @@ -243,14 +249,14 @@ static void tls_append_frag(struct tls_record_info *record, skb_frag_t *frag; frag = &record->frags[record->num_frags - 1]; - if (frag->page.p == pfrag->page && - frag->page_offset + frag->size == pfrag->offset) { - frag->size += size; + if (skb_frag_page(frag) == pfrag->page && + skb_frag_off(frag) + skb_frag_size(frag) == pfrag->offset) { + skb_frag_size_add(frag, size); } else { ++frag; - frag->page.p = pfrag->page; - frag->page_offset = pfrag->offset; - frag->size = size; + __skb_frag_set_page(frag, pfrag->page); + skb_frag_off_set(frag, pfrag->offset); + skb_frag_size_set(frag, size); ++record->num_frags; get_page(pfrag->page); } @@ -263,33 +269,15 @@ static int tls_push_record(struct sock *sk, struct tls_context *ctx, struct tls_offload_context_tx *offload_ctx, struct tls_record_info *record, - struct page_frag *pfrag, - int flags, - unsigned char record_type) + int flags) { struct tls_prot_info *prot = &ctx->prot_info; struct tcp_sock *tp = tcp_sk(sk); - struct page_frag dummy_tag_frag; skb_frag_t *frag; int i; - /* fill prepend */ - frag = &record->frags[0]; - tls_fill_prepend(ctx, - skb_frag_address(frag), - record->len - prot->prepend_size, - record_type, - prot->version); - - /* HW doesn't care about the data in the tag, because it fills it. */ - dummy_tag_frag.page = skb_frag_page(frag); - dummy_tag_frag.offset = 0; - - tls_append_frag(record, &dummy_tag_frag, prot->tag_size); record->end_seq = tp->write_seq + record->len; - spin_lock_irq(&offload_ctx->lock); - list_add_tail(&record->list, &offload_ctx->records_list); - spin_unlock_irq(&offload_ctx->lock); + list_add_tail_rcu(&record->list, &offload_ctx->records_list); offload_ctx->open_record = NULL; if (test_bit(TLS_TX_SYNC_SCHED, &ctx->flags)) @@ -301,8 +289,8 @@ static int tls_push_record(struct sock *sk, frag = &record->frags[i]; sg_unmark_end(&offload_ctx->sg_tx_data[i]); sg_set_page(&offload_ctx->sg_tx_data[i], skb_frag_page(frag), - frag->size, frag->page_offset); - sk_mem_charge(sk, frag->size); + skb_frag_size(frag), skb_frag_off(frag)); + sk_mem_charge(sk, skb_frag_size(frag)); get_page(skb_frag_page(frag)); } sg_mark_end(&offload_ctx->sg_tx_data[record->num_frags - 1]); @@ -311,6 +299,38 @@ static int tls_push_record(struct sock *sk, return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags); } +static int tls_device_record_close(struct sock *sk, + struct tls_context *ctx, + struct tls_record_info *record, + struct page_frag *pfrag, + unsigned char record_type) +{ + struct tls_prot_info *prot = &ctx->prot_info; + int ret; + + /* append tag + * device will fill in the tag, we just need to append a placeholder + * use socket memory to improve coalescing (re-using a single buffer + * increases frag count) + * if we can't allocate memory now, steal some back from data + */ + if (likely(skb_page_frag_refill(prot->tag_size, pfrag, + sk->sk_allocation))) { + ret = 0; + tls_append_frag(record, pfrag, prot->tag_size); + } else { + ret = prot->tag_size; + if (record->len <= prot->overhead_size) + return -ENOMEM; + } + + /* fill prepend */ + tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]), + record->len - prot->overhead_size, + record_type, prot->version); + return ret; +} + static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) @@ -324,7 +344,7 @@ static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx, frag = &record->frags[0]; __skb_frag_set_page(frag, pfrag->page); - frag->page_offset = pfrag->offset; + skb_frag_off_set(frag, pfrag->offset); skb_frag_size_set(frag, prepend_size); get_page(pfrag->page); @@ -365,6 +385,31 @@ static int tls_do_allocation(struct sock *sk, return 0; } +static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i) +{ + size_t pre_copy, nocache; + + pre_copy = ~((unsigned long)addr - 1) & (SMP_CACHE_BYTES - 1); + if (pre_copy) { + pre_copy = min(pre_copy, bytes); + if (copy_from_iter(addr, pre_copy, i) != pre_copy) + return -EFAULT; + bytes -= pre_copy; + addr += pre_copy; + } + + nocache = round_down(bytes, SMP_CACHE_BYTES); + if (copy_from_iter_nocache(addr, nocache, i) != nocache) + return -EFAULT; + bytes -= nocache; + addr += nocache; + + if (bytes && copy_from_iter(addr, bytes, i) != bytes) + return -EFAULT; + + return 0; +} + static int tls_push_data(struct sock *sk, struct iov_iter *msg_iter, size_t size, int flags, @@ -385,9 +430,9 @@ static int tls_push_data(struct sock *sk, if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST)) - return -ENOTSUPP; + return -EOPNOTSUPP; - if (sk->sk_err) + if (unlikely(sk->sk_err)) return -sk->sk_err; flags |= MSG_SENDPAGE_DECRYPTED; @@ -408,9 +453,8 @@ static int tls_push_data(struct sock *sk, max_open_record_len = TLS_MAX_PAYLOAD_SIZE + prot->prepend_size; do { - rc = tls_do_allocation(sk, ctx, pfrag, - prot->prepend_size); - if (rc) { + rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size); + if (unlikely(rc)) { rc = sk_stream_wait_memory(sk, &timeo); if (!rc) continue; @@ -438,12 +482,10 @@ handle_error: copy = min_t(size_t, size, (pfrag->size - pfrag->offset)); copy = min_t(size_t, copy, (max_open_record_len - record->len)); - if (copy_from_iter_nocache(page_address(pfrag->page) + - pfrag->offset, - copy, msg_iter) != copy) { - rc = -EFAULT; + rc = tls_device_copy_data(page_address(pfrag->page) + + pfrag->offset, copy, msg_iter); + if (rc) goto handle_error; - } tls_append_frag(record, pfrag, copy); size -= copy; @@ -461,13 +503,24 @@ last_record: if (done || record->len >= max_open_record_len || (record->num_frags >= MAX_SKB_FRAGS - 1)) { + rc = tls_device_record_close(sk, tls_ctx, record, + pfrag, record_type); + if (rc) { + if (rc > 0) { + size += rc; + } else { + size = orig_size; + destroy_record(record); + ctx->open_record = NULL; + break; + } + } + rc = tls_push_record(sk, tls_ctx, ctx, record, - pfrag, - tls_push_record_flags, - record_type); + tls_push_record_flags); if (rc < 0) break; } @@ -482,8 +535,10 @@ last_record: int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { unsigned char record_type = TLS_RECORD_TYPE_DATA; + struct tls_context *tls_ctx = tls_get_ctx(sk); int rc; + mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); if (unlikely(msg->msg_controllen)) { @@ -497,12 +552,14 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) out: release_sock(sk); + mutex_unlock(&tls_ctx->tx_lock); return rc; } int tls_device_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { + struct tls_context *tls_ctx = tls_get_ctx(sk); struct iov_iter msg_iter; char *kaddr = kmap(page); struct kvec iov; @@ -511,10 +568,11 @@ int tls_device_sendpage(struct sock *sk, struct page *page, if (flags & MSG_SENDPAGE_NOTLAST) flags |= MSG_MORE; + mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); if (flags & MSG_OOB) { - rc = -ENOTSUPP; + rc = -EOPNOTSUPP; goto out; } @@ -527,6 +585,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page, out: release_sock(sk); + mutex_unlock(&tls_ctx->tx_lock); return rc; } @@ -542,12 +601,16 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, /* if retransmit_hint is irrelevant start * from the beggining of the list */ - info = list_first_entry(&context->records_list, - struct tls_record_info, list); + info = list_first_entry_or_null(&context->records_list, + struct tls_record_info, list); + if (!info) + return NULL; record_sn = context->unacked_record_sn; } - list_for_each_entry_from(info, &context->records_list, list) { + /* We just need the _rcu for the READ_ONCE() */ + rcu_read_lock(); + list_for_each_entry_from_rcu(info, &context->records_list, list) { if (before(seq, info->end_seq)) { if (!context->retransmit_hint || after(info->end_seq, @@ -556,12 +619,15 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, context->retransmit_hint = info; } *p_record_sn = record_sn; - return info; + goto exit_rcu_unlock; } record_sn++; } + info = NULL; - return NULL; +exit_rcu_unlock: + rcu_read_unlock(); + return info; } EXPORT_SYMBOL(tls_get_record); @@ -575,9 +641,11 @@ static int tls_device_push_pending_record(struct sock *sk, int flags) void tls_device_write_space(struct sock *sk, struct tls_context *ctx) { - if (!sk->sk_write_pending && tls_is_partially_sent_record(ctx)) { + if (tls_is_partially_sent_record(ctx)) { gfp_t sk_allocation = sk->sk_allocation; + WARN_ON_ONCE(sk->sk_write_pending); + sk->sk_allocation = GFP_ATOMIC; tls_push_partial_record(sk, ctx, MSG_DONTWAIT | MSG_NOSIGNAL | @@ -589,15 +657,19 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx) static void tls_device_resync_rx(struct tls_context *tls_ctx, struct sock *sk, u32 seq, u8 *rcd_sn) { + struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); struct net_device *netdev; if (WARN_ON(test_and_set_bit(TLS_RX_SYNC_RUNNING, &tls_ctx->flags))) return; + + trace_tls_device_rx_resync_send(sk, seq, rcd_sn, rx_ctx->resync_type); netdev = READ_ONCE(tls_ctx->netdev); if (netdev) netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, TLS_OFFLOAD_CTX_DIR_RX); clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICERESYNC); } void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) @@ -605,8 +677,8 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; + u32 sock_data, is_req_pending; struct tls_prot_info *prot; - u32 is_req_pending; s64 resync_req; u32 req_seq; @@ -635,8 +707,12 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) /* head of next rec is already in, note that the sock_inq will * include the currently parsed message when called from parser */ - if (tcp_inq(sk) > rcd_len) + sock_data = tcp_inq(sk); + if (sock_data > rcd_len) { + trace_tls_device_rx_resync_nh_delay(sk, sock_data, + rcd_len); return; + } rx_ctx->resync_nh_do_now = 0; seq += rcd_len; @@ -680,6 +756,7 @@ static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx, /* head of next rec is already in, parser will sync for us */ if (tcp_inq(sk) > rxm->full_len) { + trace_tls_device_rx_resync_nh_schedule(sk); ctx->resync_nh_do_now = 1; } else { struct tls_prot_info *prot = &tls_ctx->prot_info; @@ -778,9 +855,9 @@ free_buf: return err; } -int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) +int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx, + struct sk_buff *skb, struct strp_msg *rxm) { - struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx); int is_decrypted = skb->decrypted; int is_encrypted = !is_decrypted; @@ -792,6 +869,10 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) is_encrypted &= !skb_iter->decrypted; } + trace_tls_device_decrypted(sk, tcp_sk(sk)->copied_seq - rxm->full_len, + tls_ctx->rx.rec_seq, rxm->full_len, + is_encrypted, is_decrypted); + ctx->sw.decrypted |= is_decrypted; /* Return immediately if the record is either entirely plaintext or @@ -823,7 +904,7 @@ static void tls_device_attach(struct tls_context *ctx, struct sock *sk, spin_unlock_irq(&tls_device_lock); ctx->sk_destruct = sk->sk_destruct; - sk->sk_destruct = tls_device_sk_destruct; + smp_store_release(&sk->sk_destruct, tls_device_sk_destruct); } } @@ -838,22 +919,18 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) struct net_device *netdev; char *iv, *rec_seq; struct sk_buff *skb; - int rc = -EINVAL; __be64 rcd_sn; + int rc; if (!ctx) - goto out; + return -EINVAL; - if (ctx->priv_ctx_tx) { - rc = -EEXIST; - goto out; - } + if (ctx->priv_ctx_tx) + return -EEXIST; start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL); - if (!start_marker_record) { - rc = -ENOMEM; - goto out; - } + if (!start_marker_record) + return -ENOMEM; offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_TX, GFP_KERNEL); if (!offload_ctx) { @@ -939,41 +1016,43 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) if (skb) TCP_SKB_CB(skb)->eor = 1; - /* We support starting offload on multiple sockets - * concurrently, so we only need a read lock here. - * This lock must precede get_netdev_for_sock to prevent races between - * NETDEV_DOWN and setsockopt. - */ - down_read(&device_offload_lock); netdev = get_netdev_for_sock(sk); if (!netdev) { pr_err_ratelimited("%s: netdev not found\n", __func__); rc = -EINVAL; - goto release_lock; + goto disable_cad; } if (!(netdev->features & NETIF_F_HW_TLS_TX)) { - rc = -ENOTSUPP; + rc = -EOPNOTSUPP; goto release_netdev; } /* Avoid offloading if the device is down * We don't want to offload new flows after * the NETDEV_DOWN event + * + * device_offload_lock is taken in tls_devices's NETDEV_DOWN + * handler thus protecting from the device going down before + * ctx was added to tls_device_list. */ + down_read(&device_offload_lock); if (!(netdev->flags & IFF_UP)) { rc = -EINVAL; - goto release_netdev; + goto release_lock; } ctx->priv_ctx_tx = offload_ctx; rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX, &ctx->crypto_send.info, tcp_sk(sk)->write_seq); + trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_TX, + tcp_sk(sk)->write_seq, rec_seq, rc); if (rc) - goto release_netdev; + goto release_lock; tls_device_attach(ctx, sk, netdev); + up_read(&device_offload_lock); /* following this assignment tls_is_sk_tx_device_offloaded * will return true and the context might be accessed @@ -981,13 +1060,14 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) */ smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb); dev_put(netdev); - up_read(&device_offload_lock); - goto out; -release_netdev: - dev_put(netdev); + return 0; + release_lock: up_read(&device_offload_lock); +release_netdev: + dev_put(netdev); +disable_cad: clean_acked_data_disable(inet_csk(sk)); crypto_free_aead(offload_ctx->aead_send); free_rec_seq: @@ -999,12 +1079,12 @@ free_offload_ctx: ctx->priv_ctx_tx = NULL; free_marker_record: kfree(start_marker_record); -out: return rc; } int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) { + struct tls12_crypto_info_aes_gcm_128 *info; struct tls_offload_context_rx *context; struct net_device *netdev; int rc = 0; @@ -1012,37 +1092,35 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) if (ctx->crypto_recv.info.version != TLS_1_2_VERSION) return -EOPNOTSUPP; - /* We support starting offload on multiple sockets - * concurrently, so we only need a read lock here. - * This lock must precede get_netdev_for_sock to prevent races between - * NETDEV_DOWN and setsockopt. - */ - down_read(&device_offload_lock); netdev = get_netdev_for_sock(sk); if (!netdev) { pr_err_ratelimited("%s: netdev not found\n", __func__); - rc = -EINVAL; - goto release_lock; + return -EINVAL; } if (!(netdev->features & NETIF_F_HW_TLS_RX)) { - rc = -ENOTSUPP; + rc = -EOPNOTSUPP; goto release_netdev; } /* Avoid offloading if the device is down * We don't want to offload new flows after * the NETDEV_DOWN event + * + * device_offload_lock is taken in tls_devices's NETDEV_DOWN + * handler thus protecting from the device going down before + * ctx was added to tls_device_list. */ + down_read(&device_offload_lock); if (!(netdev->flags & IFF_UP)) { rc = -EINVAL; - goto release_netdev; + goto release_lock; } context = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_RX, GFP_KERNEL); if (!context) { rc = -ENOMEM; - goto release_netdev; + goto release_lock; } context->resync_nh_reset = 1; @@ -1054,11 +1132,18 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, &ctx->crypto_recv.info, tcp_sk(sk)->copied_seq); + info = (void *)&ctx->crypto_recv.info; + trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_RX, + tcp_sk(sk)->copied_seq, info->rec_seq, rc); if (rc) goto free_sw_resources; tls_device_attach(ctx, sk, netdev); - goto release_netdev; + up_read(&device_offload_lock); + + dev_put(netdev); + + return 0; free_sw_resources: up_read(&device_offload_lock); @@ -1066,10 +1151,10 @@ free_sw_resources: down_read(&device_offload_lock); release_ctx: ctx->priv_ctx_rx = NULL; -release_netdev: - dev_put(netdev); release_lock: up_read(&device_offload_lock); +release_netdev: + dev_put(netdev); return rc; } diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 9070d68a92a4..28895333701e 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -273,7 +273,7 @@ static int fill_sg_in(struct scatterlist *sg_in, __skb_frag_ref(frag); sg_set_page(sg_in + i, skb_frag_page(frag), - skb_frag_size(frag), frag->page_offset); + skb_frag_size(frag), skb_frag_off(frag)); remaining -= skb_frag_size(frag); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 43252a801c3f..94774c0e5ff3 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -39,8 +39,11 @@ #include <linux/netdevice.h> #include <linux/sched/signal.h> #include <linux/inetdevice.h> +#include <linux/inet_diag.h> +#include <net/snmp.h> #include <net/tls.h> +#include <net/tls_toe.h> MODULE_AUTHOR("Mellanox Technologies"); MODULE_DESCRIPTION("Transport Layer Security Support"); @@ -57,14 +60,12 @@ static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); static struct proto *saved_tcpv4_prot; static DEFINE_MUTEX(tcpv4_prot_mutex); -static LIST_HEAD(device_list); -static DEFINE_SPINLOCK(device_spinlock); static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG]; static struct proto_ops tls_sw_proto_ops; static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], struct proto *base); -static void update_sk_prot(struct sock *sk, struct tls_context *ctx) +void update_sk_prot(struct sock *sk, struct tls_context *ctx) { int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; @@ -208,24 +209,15 @@ int tls_push_partial_record(struct sock *sk, struct tls_context *ctx, return tls_push_sg(sk, ctx, sg, offset, flags); } -bool tls_free_partial_record(struct sock *sk, struct tls_context *ctx) +void tls_free_partial_record(struct sock *sk, struct tls_context *ctx) { struct scatterlist *sg; - sg = ctx->partially_sent_record; - if (!sg) - return false; - - while (1) { + for (sg = ctx->partially_sent_record; sg; sg = sg_next(sg)) { put_page(sg_page(sg)); sk_mem_uncharge(sk, sg->length); - - if (sg_is_last(sg)) - break; - sg++; } ctx->partially_sent_record = NULL; - return true; } static void tls_write_space(struct sock *sk) @@ -251,14 +243,27 @@ static void tls_write_space(struct sock *sk) ctx->sk_write_space(sk); } -void tls_ctx_free(struct tls_context *ctx) +/** + * tls_ctx_free() - free TLS ULP context + * @sk: socket to with @ctx is attached + * @ctx: TLS context structure + * + * Free TLS context. If @sk is %NULL caller guarantees that the socket + * to which @ctx was attached has no outstanding references. + */ +void tls_ctx_free(struct sock *sk, struct tls_context *ctx) { if (!ctx) return; memzero_explicit(&ctx->crypto_send, sizeof(ctx->crypto_send)); memzero_explicit(&ctx->crypto_recv, sizeof(ctx->crypto_recv)); - kfree(ctx); + mutex_destroy(&ctx->tx_lock); + + if (sk) + kfree_rcu(ctx, rcu); + else + kfree(ctx); } static void tls_sk_proto_cleanup(struct sock *sk, @@ -273,19 +278,19 @@ static void tls_sk_proto_cleanup(struct sock *sk, kfree(ctx->tx.rec_seq); kfree(ctx->tx.iv); tls_sw_release_resources_tx(sk); -#ifdef CONFIG_TLS_DEVICE + TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW); } else if (ctx->tx_conf == TLS_HW) { tls_device_free_resources_tx(sk); -#endif + TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE); } - if (ctx->rx_conf == TLS_SW) + if (ctx->rx_conf == TLS_SW) { tls_sw_release_resources_rx(sk); - -#ifdef CONFIG_TLS_DEVICE - if (ctx->rx_conf == TLS_HW) + TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW); + } else if (ctx->rx_conf == TLS_HW) { tls_device_offload_cleanup_rx(sk); -#endif + TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE); + } } static void tls_sk_proto_close(struct sock *sk, long timeout) @@ -306,7 +311,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) write_lock_bh(&sk->sk_callback_lock); if (free_ctx) - icsk->icsk_ulp_data = NULL; + rcu_assign_pointer(icsk->icsk_ulp_data, NULL); sk->sk_prot = ctx->sk_proto; if (sk->sk_write_space == tls_write_space) sk->sk_write_space = ctx->sk_write_space; @@ -318,10 +323,10 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) tls_sw_strparser_done(ctx); if (ctx->rx_conf == TLS_SW) tls_sw_free_ctx_rx(ctx); - ctx->sk_proto_close(sk, timeout); + ctx->sk_proto->close(sk, timeout); if (free_ctx) - tls_ctx_free(ctx); + tls_ctx_free(sk, ctx); } static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, @@ -438,7 +443,8 @@ static int tls_getsockopt(struct sock *sk, int level, int optname, struct tls_context *ctx = tls_get_ctx(sk); if (level != SOL_TLS) - return ctx->getsockopt(sk, level, optname, optval, optlen); + return ctx->sk_proto->getsockopt(sk, level, + optname, optval, optlen); return do_tls_getsockopt(sk, optname, optval, optlen); } @@ -481,7 +487,7 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, /* check version */ if (crypto_info->version != TLS_1_2_VERSION && crypto_info->version != TLS_1_3_VERSION) { - rc = -ENOTSUPP; + rc = -EINVAL; goto err_crypto_info; } @@ -523,29 +529,31 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, } if (tx) { -#ifdef CONFIG_TLS_DEVICE rc = tls_set_device_offload(sk, ctx); conf = TLS_HW; - if (rc) { -#else - { -#endif + if (!rc) { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXDEVICE); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE); + } else { rc = tls_set_sw_offload(sk, ctx, 1); if (rc) goto err_crypto_info; + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXSW); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW); conf = TLS_SW; } } else { -#ifdef CONFIG_TLS_DEVICE rc = tls_set_device_offload_rx(sk, ctx); conf = TLS_HW; - if (rc) { -#else - { -#endif + if (!rc) { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICE); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE); + } else { rc = tls_set_sw_offload(sk, ctx, 0); if (rc) goto err_crypto_info; + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXSW); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW); conf = TLS_SW; } tls_sw_strparser_arm(sk, ctx); @@ -596,12 +604,13 @@ static int tls_setsockopt(struct sock *sk, int level, int optname, struct tls_context *ctx = tls_get_ctx(sk); if (level != SOL_TLS) - return ctx->setsockopt(sk, level, optname, optval, optlen); + return ctx->sk_proto->setsockopt(sk, level, optname, optval, + optlen); return do_tls_setsockopt(sk, optname, optval, optlen); } -static struct tls_context *create_ctx(struct sock *sk) +struct tls_context *tls_ctx_create(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tls_context *ctx; @@ -610,11 +619,9 @@ static struct tls_context *create_ctx(struct sock *sk) if (!ctx) return NULL; - icsk->icsk_ulp_data = ctx; - ctx->setsockopt = sk->sk_prot->setsockopt; - ctx->getsockopt = sk->sk_prot->getsockopt; - ctx->sk_proto_close = sk->sk_prot->close; - ctx->unhash = sk->sk_prot->unhash; + mutex_init(&ctx->tx_lock); + rcu_assign_pointer(icsk->icsk_ulp_data, ctx); + ctx->sk_proto = sk->sk_prot; return ctx; } @@ -644,93 +651,6 @@ static void tls_build_proto(struct sock *sk) } } -static void tls_hw_sk_destruct(struct sock *sk) -{ - struct tls_context *ctx = tls_get_ctx(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - - ctx->sk_destruct(sk); - /* Free ctx */ - tls_ctx_free(ctx); - icsk->icsk_ulp_data = NULL; -} - -static int tls_hw_prot(struct sock *sk) -{ - struct tls_context *ctx; - struct tls_device *dev; - int rc = 0; - - spin_lock_bh(&device_spinlock); - list_for_each_entry(dev, &device_list, dev_list) { - if (dev->feature && dev->feature(dev)) { - ctx = create_ctx(sk); - if (!ctx) - goto out; - - spin_unlock_bh(&device_spinlock); - tls_build_proto(sk); - ctx->hash = sk->sk_prot->hash; - ctx->unhash = sk->sk_prot->unhash; - ctx->sk_proto_close = sk->sk_prot->close; - ctx->sk_destruct = sk->sk_destruct; - sk->sk_destruct = tls_hw_sk_destruct; - ctx->rx_conf = TLS_HW_RECORD; - ctx->tx_conf = TLS_HW_RECORD; - update_sk_prot(sk, ctx); - spin_lock_bh(&device_spinlock); - rc = 1; - break; - } - } -out: - spin_unlock_bh(&device_spinlock); - return rc; -} - -static void tls_hw_unhash(struct sock *sk) -{ - struct tls_context *ctx = tls_get_ctx(sk); - struct tls_device *dev; - - spin_lock_bh(&device_spinlock); - list_for_each_entry(dev, &device_list, dev_list) { - if (dev->unhash) { - kref_get(&dev->kref); - spin_unlock_bh(&device_spinlock); - dev->unhash(dev, sk); - kref_put(&dev->kref, dev->release); - spin_lock_bh(&device_spinlock); - } - } - spin_unlock_bh(&device_spinlock); - ctx->unhash(sk); -} - -static int tls_hw_hash(struct sock *sk) -{ - struct tls_context *ctx = tls_get_ctx(sk); - struct tls_device *dev; - int err; - - err = ctx->hash(sk); - spin_lock_bh(&device_spinlock); - list_for_each_entry(dev, &device_list, dev_list) { - if (dev->hash) { - kref_get(&dev->kref); - spin_unlock_bh(&device_spinlock); - err |= dev->hash(dev, sk); - kref_put(&dev->kref, dev->release); - spin_lock_bh(&device_spinlock); - } - } - spin_unlock_bh(&device_spinlock); - - if (err) - tls_hw_unhash(sk); - return err; -} - static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], struct proto *base) { @@ -768,10 +688,11 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_HW][TLS_HW] = prot[TLS_HW][TLS_SW]; #endif - +#ifdef CONFIG_TLS_TOE prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base; - prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_hw_hash; - prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_hw_unhash; + prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_toe_hash; + prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_toe_unhash; +#endif } static int tls_init(struct sock *sk) @@ -779,8 +700,12 @@ static int tls_init(struct sock *sk) struct tls_context *ctx; int rc = 0; - if (tls_hw_prot(sk)) + tls_build_proto(sk); + +#ifdef CONFIG_TLS_TOE + if (tls_toe_bypass(sk)) return 0; +#endif /* The TLS ulp is currently supported only for TCP sockets * in ESTABLISHED state. @@ -789,13 +714,11 @@ static int tls_init(struct sock *sk) * share the ulp context. */ if (sk->sk_state != TCP_ESTABLISHED) - return -ENOTSUPP; - - tls_build_proto(sk); + return -ENOTCONN; /* allocate tls context */ write_lock_bh(&sk->sk_callback_lock); - ctx = create_ctx(sk); + ctx = tls_ctx_create(sk); if (!ctx) { rc = -ENOMEM; goto out; @@ -803,57 +726,139 @@ static int tls_init(struct sock *sk) ctx->tx_conf = TLS_BASE; ctx->rx_conf = TLS_BASE; - ctx->sk_proto = sk->sk_prot; update_sk_prot(sk, ctx); out: write_unlock_bh(&sk->sk_callback_lock); return rc; } -static void tls_update(struct sock *sk, struct proto *p) +static void tls_update(struct sock *sk, struct proto *p, + void (*write_space)(struct sock *sk)) { struct tls_context *ctx; ctx = tls_get_ctx(sk); if (likely(ctx)) { - ctx->sk_proto_close = p->close; + ctx->sk_write_space = write_space; ctx->sk_proto = p; } else { sk->sk_prot = p; + sk->sk_write_space = write_space; } } -void tls_register_device(struct tls_device *device) +static int tls_get_info(const struct sock *sk, struct sk_buff *skb) { - spin_lock_bh(&device_spinlock); - list_add_tail(&device->dev_list, &device_list); - spin_unlock_bh(&device_spinlock); + u16 version, cipher_type; + struct tls_context *ctx; + struct nlattr *start; + int err; + + start = nla_nest_start_noflag(skb, INET_ULP_INFO_TLS); + if (!start) + return -EMSGSIZE; + + rcu_read_lock(); + ctx = rcu_dereference(inet_csk(sk)->icsk_ulp_data); + if (!ctx) { + err = 0; + goto nla_failure; + } + version = ctx->prot_info.version; + if (version) { + err = nla_put_u16(skb, TLS_INFO_VERSION, version); + if (err) + goto nla_failure; + } + cipher_type = ctx->prot_info.cipher_type; + if (cipher_type) { + err = nla_put_u16(skb, TLS_INFO_CIPHER, cipher_type); + if (err) + goto nla_failure; + } + err = nla_put_u16(skb, TLS_INFO_TXCONF, tls_user_config(ctx, true)); + if (err) + goto nla_failure; + + err = nla_put_u16(skb, TLS_INFO_RXCONF, tls_user_config(ctx, false)); + if (err) + goto nla_failure; + + rcu_read_unlock(); + nla_nest_end(skb, start); + return 0; + +nla_failure: + rcu_read_unlock(); + nla_nest_cancel(skb, start); + return err; } -EXPORT_SYMBOL(tls_register_device); -void tls_unregister_device(struct tls_device *device) +static size_t tls_get_info_size(const struct sock *sk) { - spin_lock_bh(&device_spinlock); - list_del(&device->dev_list); - spin_unlock_bh(&device_spinlock); + size_t size = 0; + + size += nla_total_size(0) + /* INET_ULP_INFO_TLS */ + nla_total_size(sizeof(u16)) + /* TLS_INFO_VERSION */ + nla_total_size(sizeof(u16)) + /* TLS_INFO_CIPHER */ + nla_total_size(sizeof(u16)) + /* TLS_INFO_RXCONF */ + nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */ + 0; + + return size; } -EXPORT_SYMBOL(tls_unregister_device); + +static int __net_init tls_init_net(struct net *net) +{ + int err; + + net->mib.tls_statistics = alloc_percpu(struct linux_tls_mib); + if (!net->mib.tls_statistics) + return -ENOMEM; + + err = tls_proc_init(net); + if (err) + goto err_free_stats; + + return 0; +err_free_stats: + free_percpu(net->mib.tls_statistics); + return err; +} + +static void __net_exit tls_exit_net(struct net *net) +{ + tls_proc_fini(net); + free_percpu(net->mib.tls_statistics); +} + +static struct pernet_operations tls_proc_ops = { + .init = tls_init_net, + .exit = tls_exit_net, +}; static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .name = "tls", .owner = THIS_MODULE, .init = tls_init, .update = tls_update, + .get_info = tls_get_info, + .get_info_size = tls_get_info_size, }; static int __init tls_register(void) { + int err; + + err = register_pernet_subsys(&tls_proc_ops); + if (err) + return err; + tls_sw_proto_ops = inet_stream_ops; tls_sw_proto_ops.splice_read = tls_sw_splice_read; + tls_sw_proto_ops.sendpage_locked = tls_sw_sendpage_locked, -#ifdef CONFIG_TLS_DEVICE tls_device_init(); -#endif tcp_register_ulp(&tcp_tls_ulp_ops); return 0; @@ -862,9 +867,8 @@ static int __init tls_register(void) static void __exit tls_unregister(void) { tcp_unregister_ulp(&tcp_tls_ulp_ops); -#ifdef CONFIG_TLS_DEVICE tls_device_cleanup(); -#endif + unregister_pernet_subsys(&tls_proc_ops); } module_init(tls_register); diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c new file mode 100644 index 000000000000..3a5dd1e07233 --- /dev/null +++ b/net/tls/tls_proc.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2019 Netronome Systems, Inc. */ + +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <net/snmp.h> +#include <net/tls.h> + +#ifdef CONFIG_PROC_FS +static const struct snmp_mib tls_mib_list[] = { + SNMP_MIB_ITEM("TlsCurrTxSw", LINUX_MIB_TLSCURRTXSW), + SNMP_MIB_ITEM("TlsCurrRxSw", LINUX_MIB_TLSCURRRXSW), + SNMP_MIB_ITEM("TlsCurrTxDevice", LINUX_MIB_TLSCURRTXDEVICE), + SNMP_MIB_ITEM("TlsCurrRxDevice", LINUX_MIB_TLSCURRRXDEVICE), + SNMP_MIB_ITEM("TlsTxSw", LINUX_MIB_TLSTXSW), + SNMP_MIB_ITEM("TlsRxSw", LINUX_MIB_TLSRXSW), + SNMP_MIB_ITEM("TlsTxDevice", LINUX_MIB_TLSTXDEVICE), + SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE), + SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR), + SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC), + SNMP_MIB_SENTINEL +}; + +static int tls_statistics_seq_show(struct seq_file *seq, void *v) +{ + unsigned long buf[LINUX_MIB_TLSMAX] = {}; + struct net *net = seq->private; + int i; + + snmp_get_cpu_field_batch(buf, tls_mib_list, net->mib.tls_statistics); + for (i = 0; tls_mib_list[i].name; i++) + seq_printf(seq, "%-32s\t%lu\n", tls_mib_list[i].name, buf[i]); + + return 0; +} +#endif + +int __net_init tls_proc_init(struct net *net) +{ + if (!proc_create_net_single("tls_stat", 0444, net->proc_net, + tls_statistics_seq_show, NULL)) + return -ENOMEM; + return 0; +} + +void __net_exit tls_proc_fini(struct net *net) +{ + remove_proc_entry("tls_stat", net->proc_net); +} diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 91d21b048a9b..c98e602a1a2d 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -168,6 +168,9 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err) /* Propagate if there was an err */ if (err) { + if (err == -EBADMSG) + TLS_INC_STATS(sock_net(skb->sk), + LINUX_MIB_TLSDECRYPTERROR); ctx->async_wait.err = err; tls_err_abort(skb->sk, err); } else { @@ -677,12 +680,32 @@ static int tls_push_record(struct sock *sk, int flags, split_point = msg_pl->apply_bytes; split = split_point && split_point < msg_pl->sg.size; + if (unlikely((!split && + msg_pl->sg.size + + prot->overhead_size > msg_en->sg.size) || + (split && + split_point + + prot->overhead_size > msg_en->sg.size))) { + split = true; + split_point = msg_en->sg.size; + } if (split) { rc = tls_split_open_record(sk, rec, &tmp, msg_pl, msg_en, split_point, prot->overhead_size, &orig_end); if (rc < 0) return rc; + /* This can happen if above tls_split_open_record allocates + * a single large encryption buffer instead of two smaller + * ones. In this case adjust pointers and continue without + * split. + */ + if (!msg_pl->sg.size) { + tls_merge_open_record(sk, rec, tmp, orig_end); + msg_pl = &rec->msg_plaintext; + msg_en = &rec->msg_encrypted; + split = false; + } sk_msg_trim(sk, msg_en, msg_pl->sg.size + prot->overhead_size); } @@ -704,9 +727,14 @@ static int tls_push_record(struct sock *sk, int flags, sg_mark_end(sk_msg_elem(msg_pl, i)); } + if (msg_pl->sg.end < msg_pl->sg.start) { + sg_chain(&msg_pl->sg.data[msg_pl->sg.start], + MAX_SKB_FRAGS - msg_pl->sg.start + 1, + msg_pl->sg.data); + } + i = msg_pl->sg.start; - sg_chain(rec->sg_aead_in, 2, rec->inplace_crypto ? - &msg_en->sg.data[i] : &msg_pl->sg.data[i]); + sg_chain(rec->sg_aead_in, 2, &msg_pl->sg.data[i]); i = msg_en->sg.end; sk_msg_iter_var_prev(i); @@ -766,17 +794,20 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, policy = !(flags & MSG_SENDPAGE_NOPOLICY); psock = sk_psock_get(sk); - if (!psock || !policy) - return tls_push_record(sk, flags, record_type); + if (!psock || !policy) { + err = tls_push_record(sk, flags, record_type); + if (err && err != -EINPROGRESS) { + *copied -= sk_msg_free(sk, msg); + tls_free_open_rec(sk); + } + return err; + } more_data: enospc = sk_msg_full(msg); if (psock->eval == __SK_NONE) { delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); - if (delta < msg->sg.size) - delta -= msg->sg.size; - else - delta = 0; + delta -= msg->sg.size; } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc && !full_record) { @@ -791,7 +822,7 @@ more_data: switch (psock->eval) { case __SK_PASS: err = tls_push_record(sk, flags, record_type); - if (err < 0) { + if (err && err != -EINPROGRESS) { *copied -= sk_msg_free(sk, msg); tls_free_open_rec(sk); goto out_err; @@ -895,17 +926,11 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) int ret = 0; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) - return -ENOTSUPP; + return -EOPNOTSUPP; + mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); - /* Wait till there is any pending write on socket */ - if (unlikely(sk->sk_write_pending)) { - ret = wait_on_pending_writer(sk, &timeo); - if (unlikely(ret)) - goto send_end; - } - if (unlikely(msg->msg_controllen)) { ret = tls_proccess_cmsg(sk, msg, &record_type); if (ret) { @@ -971,8 +996,6 @@ alloc_encrypted: if (ret) goto fallback_to_reg_send; - rec->inplace_crypto = 0; - num_zc++; copied += try_to_copy; @@ -985,7 +1008,7 @@ alloc_encrypted: num_async++; else if (ret == -ENOMEM) goto wait_for_memory; - else if (ret == -ENOSPC) + else if (ctx->open_rec && ret == -ENOSPC) goto rollback_iter; else if (ret != -EAGAIN) goto send_end; @@ -1054,11 +1077,12 @@ wait_for_memory: ret = sk_stream_wait_memory(sk, &timeo); if (ret) { trim_sgl: - tls_trim_both_msgs(sk, orig_size); + if (ctx->open_rec) + tls_trim_both_msgs(sk, orig_size); goto send_end; } - if (msg_en->sg.size < required_size) + if (ctx->open_rec && msg_en->sg.size < required_size) goto alloc_encrypted; } @@ -1091,6 +1115,7 @@ send_end: ret = sk_stream_error(sk, msg->msg_flags, ret); release_sock(sk); + mutex_unlock(&tls_ctx->tx_lock); return copied ? copied : ret; } @@ -1114,13 +1139,6 @@ static int tls_sw_do_sendpage(struct sock *sk, struct page *page, eor = !(flags & (MSG_MORE | MSG_SENDPAGE_NOTLAST)); sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - /* Wait till there is any pending write on socket */ - if (unlikely(sk->sk_write_pending)) { - ret = wait_on_pending_writer(sk, &timeo); - if (unlikely(ret)) - goto sendpage_end; - } - /* Call the sk_stream functions to manage the sndbuf mem. */ while (size > 0) { size_t copy, required_size; @@ -1176,7 +1194,6 @@ alloc_payload: tls_ctx->pending_open_record_frags = true; if (full_record || eor || sk_msg_full(msg_pl)) { - rec->inplace_crypto = 0; ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, record_type, &copied, flags); if (ret) { @@ -1197,11 +1214,13 @@ wait_for_sndbuf: wait_for_memory: ret = sk_stream_wait_memory(sk, &timeo); if (ret) { - tls_trim_both_msgs(sk, msg_pl->sg.size); + if (ctx->open_rec) + tls_trim_both_msgs(sk, msg_pl->sg.size); goto sendpage_end; } - goto alloc_payload; + if (ctx->open_rec) + goto alloc_payload; } if (num_async) { @@ -1216,18 +1235,32 @@ sendpage_end: return copied ? copied : ret; } +int tls_sw_sendpage_locked(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY | + MSG_NO_SHARED_FRAGS)) + return -EOPNOTSUPP; + + return tls_sw_do_sendpage(sk, page, offset, size, flags); +} + int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { + struct tls_context *tls_ctx = tls_get_ctx(sk); int ret; if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY)) - return -ENOTSUPP; + return -EOPNOTSUPP; + mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); ret = tls_sw_do_sendpage(sk, page, offset, size, flags); release_sock(sk); + mutex_unlock(&tls_ctx->tx_lock); return ret; } @@ -1489,13 +1522,12 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, int pad, err = 0; if (!ctx->decrypted) { -#ifdef CONFIG_TLS_DEVICE if (tls_ctx->rx_conf == TLS_HW) { - err = tls_device_decrypted(sk, skb); + err = tls_device_decrypted(sk, tls_ctx, skb, rxm); if (err < 0) return err; } -#endif + /* Still not decrypted after tls_device */ if (!ctx->decrypted) { err = decrypt_internal(sk, skb, dest, NULL, chunk, zc, @@ -1504,7 +1536,9 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, if (err == -EINPROGRESS) tls_advance_record_sn(sk, prot, &tls_ctx->rx); - + else if (err == -EBADMSG) + TLS_INC_STATS(sock_net(sk), + LINUX_MIB_TLSDECRYPTERROR); return err; } } else { @@ -1519,7 +1553,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, rxm->offset += prot->prepend_size; rxm->full_len -= prot->overhead_size; tls_advance_record_sn(sk, prot, &tls_ctx->rx); - ctx->decrypted = true; + ctx->decrypted = 1; ctx->saved_data_ready(sk); } else { *zc = false; @@ -1921,7 +1955,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, /* splice does not support reading control messages */ if (ctx->control != TLS_RECORD_TYPE_DATA) { - err = -ENOTSUPP; + err = -EINVAL; goto splice_read_end; } @@ -1929,7 +1963,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, tls_err_abort(sk, EBADMSG); goto splice_read_end; } - ctx->decrypted = true; + ctx->decrypted = 1; } rxm = strp_msg(skb); @@ -2014,10 +2048,9 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) ret = -EINVAL; goto read_failure; } -#ifdef CONFIG_TLS_DEVICE + tls_device_rx_resync_new_rec(strp->sk, data_len + TLS_HEADER_SIZE, TCP_SKB_CB(skb)->seq + rxm->offset); -#endif return data_len + TLS_HEADER_SIZE; read_failure: @@ -2031,7 +2064,7 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb) struct tls_context *tls_ctx = tls_get_ctx(strp->sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - ctx->decrypted = false; + ctx->decrypted = 0; ctx->recv_pkt = skb; strp_pause(strp); @@ -2079,7 +2112,8 @@ void tls_sw_release_resources_tx(struct sock *sk) /* Free up un-sent records in tx_list. First, free * the partially sent record if any at head of tx_list. */ - if (tls_free_partial_record(sk, tls_ctx)) { + if (tls_ctx->partially_sent_record) { + tls_free_partial_record(sk, tls_ctx); rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); list_del(&rec->list); @@ -2172,9 +2206,11 @@ static void tx_work_handler(struct work_struct *work) if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) return; + mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); tls_tx_records(sk, -1); release_sock(sk); + mutex_unlock(&tls_ctx->tx_lock); } void tls_sw_write_space(struct sock *sk, struct tls_context *ctx) @@ -2182,12 +2218,9 @@ void tls_sw_write_space(struct sock *sk, struct tls_context *ctx) struct tls_sw_context_tx *tx_ctx = tls_sw_ctx_tx(ctx); /* Schedule the transmission if tx list is ready */ - if (is_tx_ready(tx_ctx) && !sk->sk_write_pending) { - /* Schedule the transmission */ - if (!test_and_set_bit(BIT_TX_SCHEDULED, - &tx_ctx->tx_bitmask)) - schedule_delayed_work(&tx_ctx->tx_work.work, 0); - } + if (is_tx_ready(tx_ctx) && + !test_and_set_bit(BIT_TX_SCHEDULED, &tx_ctx->tx_bitmask)) + schedule_delayed_work(&tx_ctx->tx_work.work, 0); } void tls_sw_strparser_arm(struct sock *sk, struct tls_context *tls_ctx) @@ -2388,10 +2421,11 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv); if (crypto_info->version == TLS_1_3_VERSION) - sw_ctx_rx->async_capable = false; + sw_ctx_rx->async_capable = 0; else sw_ctx_rx->async_capable = - tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC; + !!(tfm->__crt_alg->cra_flags & + CRYPTO_ALG_ASYNC); /* Set up strparser */ memset(&cb, 0, sizeof(cb)); diff --git a/net/tls/tls_toe.c b/net/tls/tls_toe.c new file mode 100644 index 000000000000..7e1330f19165 --- /dev/null +++ b/net/tls/tls_toe.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. + * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/list.h> +#include <linux/rcupdate.h> +#include <linux/spinlock.h> +#include <net/inet_connection_sock.h> +#include <net/tls.h> +#include <net/tls_toe.h> + +static LIST_HEAD(device_list); +static DEFINE_SPINLOCK(device_spinlock); + +static void tls_toe_sk_destruct(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tls_context *ctx = tls_get_ctx(sk); + + ctx->sk_destruct(sk); + /* Free ctx */ + rcu_assign_pointer(icsk->icsk_ulp_data, NULL); + tls_ctx_free(sk, ctx); +} + +int tls_toe_bypass(struct sock *sk) +{ + struct tls_toe_device *dev; + struct tls_context *ctx; + int rc = 0; + + spin_lock_bh(&device_spinlock); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->feature && dev->feature(dev)) { + ctx = tls_ctx_create(sk); + if (!ctx) + goto out; + + ctx->sk_destruct = sk->sk_destruct; + sk->sk_destruct = tls_toe_sk_destruct; + ctx->rx_conf = TLS_HW_RECORD; + ctx->tx_conf = TLS_HW_RECORD; + update_sk_prot(sk, ctx); + rc = 1; + break; + } + } +out: + spin_unlock_bh(&device_spinlock); + return rc; +} + +void tls_toe_unhash(struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_toe_device *dev; + + spin_lock_bh(&device_spinlock); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->unhash) { + kref_get(&dev->kref); + spin_unlock_bh(&device_spinlock); + dev->unhash(dev, sk); + kref_put(&dev->kref, dev->release); + spin_lock_bh(&device_spinlock); + } + } + spin_unlock_bh(&device_spinlock); + ctx->sk_proto->unhash(sk); +} + +int tls_toe_hash(struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_toe_device *dev; + int err; + + err = ctx->sk_proto->hash(sk); + spin_lock_bh(&device_spinlock); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->hash) { + kref_get(&dev->kref); + spin_unlock_bh(&device_spinlock); + err |= dev->hash(dev, sk); + kref_put(&dev->kref, dev->release); + spin_lock_bh(&device_spinlock); + } + } + spin_unlock_bh(&device_spinlock); + + if (err) + tls_toe_unhash(sk); + return err; +} + +void tls_toe_register_device(struct tls_toe_device *device) +{ + spin_lock_bh(&device_spinlock); + list_add_tail(&device->dev_list, &device_list); + spin_unlock_bh(&device_spinlock); +} +EXPORT_SYMBOL(tls_toe_register_device); + +void tls_toe_unregister_device(struct tls_toe_device *device) +{ + spin_lock_bh(&device_spinlock); + list_del(&device->dev_list); + spin_unlock_bh(&device_spinlock); +} +EXPORT_SYMBOL(tls_toe_unregister_device); diff --git a/net/tls/trace.c b/net/tls/trace.c new file mode 100644 index 000000000000..e374913cf9c9 --- /dev/null +++ b/net/tls/trace.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2019 Netronome Systems, Inc. */ + +#include <linux/module.h> + +#ifndef __CHECKER__ +#define CREATE_TRACE_POINTS +#include "trace.h" + +#endif diff --git a/net/tls/trace.h b/net/tls/trace.h new file mode 100644 index 000000000000..9ba5f600ea43 --- /dev/null +++ b/net/tls/trace.h @@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (C) 2019 Netronome Systems, Inc. */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM tls + +#if !defined(_TLS_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) +#define _TLS_TRACE_H_ + +#include <asm/unaligned.h> +#include <linux/tracepoint.h> + +struct sock; + +TRACE_EVENT(tls_device_offload_set, + + TP_PROTO(struct sock *sk, int dir, u32 tcp_seq, u8 *rec_no, int ret), + + TP_ARGS(sk, dir, tcp_seq, rec_no, ret), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u64, rec_no ) + __field( int, dir ) + __field( u32, tcp_seq ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->rec_no = get_unaligned_be64(rec_no); + __entry->dir = dir; + __entry->tcp_seq = tcp_seq; + __entry->ret = ret; + ), + + TP_printk( + "sk=%p direction=%d tcp_seq=%u rec_no=%llu ret=%d", + __entry->sk, __entry->dir, __entry->tcp_seq, __entry->rec_no, + __entry->ret + ) +); + +TRACE_EVENT(tls_device_decrypted, + + TP_PROTO(struct sock *sk, u32 tcp_seq, u8 *rec_no, u32 rec_len, + bool encrypted, bool decrypted), + + TP_ARGS(sk, tcp_seq, rec_no, rec_len, encrypted, decrypted), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u64, rec_no ) + __field( u32, tcp_seq ) + __field( u32, rec_len ) + __field( bool, encrypted ) + __field( bool, decrypted ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->rec_no = get_unaligned_be64(rec_no); + __entry->tcp_seq = tcp_seq; + __entry->rec_len = rec_len; + __entry->encrypted = encrypted; + __entry->decrypted = decrypted; + ), + + TP_printk( + "sk=%p tcp_seq=%u rec_no=%llu len=%u encrypted=%d decrypted=%d", + __entry->sk, __entry->tcp_seq, + __entry->rec_no, __entry->rec_len, + __entry->encrypted, __entry->decrypted + ) +); + +TRACE_EVENT(tls_device_rx_resync_send, + + TP_PROTO(struct sock *sk, u32 tcp_seq, u8 *rec_no, int sync_type), + + TP_ARGS(sk, tcp_seq, rec_no, sync_type), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u64, rec_no ) + __field( u32, tcp_seq ) + __field( int, sync_type ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->rec_no = get_unaligned_be64(rec_no); + __entry->tcp_seq = tcp_seq; + __entry->sync_type = sync_type; + ), + + TP_printk( + "sk=%p tcp_seq=%u rec_no=%llu sync_type=%d", + __entry->sk, __entry->tcp_seq, __entry->rec_no, + __entry->sync_type + ) +); + +TRACE_EVENT(tls_device_rx_resync_nh_schedule, + + TP_PROTO(struct sock *sk), + + TP_ARGS(sk), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + ), + + TP_fast_assign( + __entry->sk = sk; + ), + + TP_printk( + "sk=%p", __entry->sk + ) +); + +TRACE_EVENT(tls_device_rx_resync_nh_delay, + + TP_PROTO(struct sock *sk, u32 sock_data, u32 rec_len), + + TP_ARGS(sk, sock_data, rec_len), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u32, sock_data ) + __field( u32, rec_len ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->sock_data = sock_data; + __entry->rec_len = rec_len; + ), + + TP_printk( + "sk=%p sock_data=%u rec_len=%u", + __entry->sk, __entry->sock_data, __entry->rec_len + ) +); + +TRACE_EVENT(tls_device_tx_resync_req, + + TP_PROTO(struct sock *sk, u32 tcp_seq, u32 exp_tcp_seq), + + TP_ARGS(sk, tcp_seq, exp_tcp_seq), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u32, tcp_seq ) + __field( u32, exp_tcp_seq ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->tcp_seq = tcp_seq; + __entry->exp_tcp_seq = exp_tcp_seq; + ), + + TP_printk( + "sk=%p tcp_seq=%u exp_tcp_seq=%u", + __entry->sk, __entry->tcp_seq, __entry->exp_tcp_seq + ) +); + +TRACE_EVENT(tls_device_tx_resync_send, + + TP_PROTO(struct sock *sk, u32 tcp_seq, u8 *rec_no), + + TP_ARGS(sk, tcp_seq, rec_no), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u64, rec_no ) + __field( u32, tcp_seq ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->rec_no = get_unaligned_be64(rec_no); + __entry->tcp_seq = tcp_seq; + ), + + TP_printk( + "sk=%p tcp_seq=%u rec_no=%llu", + __entry->sk, __entry->tcp_seq, __entry->rec_no + ) +); + +#endif /* _TLS_TRACE_H_ */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +#include <trace/define_trace.h> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 67e87db5877f..62c12cb5763e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -189,11 +189,17 @@ static inline int unix_may_send(struct sock *sk, struct sock *osk) return unix_peer(osk) == NULL || unix_our_peer(sk, osk); } -static inline int unix_recvq_full(struct sock const *sk) +static inline int unix_recvq_full(const struct sock *sk) { return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; } +static inline int unix_recvq_full_lockless(const struct sock *sk) +{ + return skb_queue_len_lockless(&sk->sk_receive_queue) > + READ_ONCE(sk->sk_max_ack_backlog); +} + struct sock *unix_peer_get(struct sock *s) { struct sock *peer; @@ -284,11 +290,9 @@ static struct sock *__unix_find_socket_byname(struct net *net, if (u->addr->len == len && !memcmp(u->addr->name, sunname, len)) - goto found; + return s; } - s = NULL; -found: - return s; + return NULL; } static inline struct sock *unix_find_socket_byname(struct net *net, @@ -646,6 +650,9 @@ static __poll_t unix_poll(struct file *, struct socket *, poll_table *); static __poll_t unix_dgram_poll(struct file *, struct socket *, poll_table *); static int unix_ioctl(struct socket *, unsigned int, unsigned long); +#ifdef CONFIG_COMPAT +static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); +#endif static int unix_shutdown(struct socket *, int); static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); @@ -675,6 +682,16 @@ static int unix_set_peek_off(struct sock *sk, int val) return 0; } +static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) +{ + struct sock *sk = sock->sk; + struct unix_sock *u; + + if (sk) { + u = unix_sk(sock->sk); + seq_printf(m, "scm_fds: %u\n", READ_ONCE(u->scm_stat.nr_fds)); + } +} static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, @@ -687,6 +704,9 @@ static const struct proto_ops unix_stream_ops = { .getname = unix_getname, .poll = unix_poll, .ioctl = unix_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = unix_compat_ioctl, +#endif .listen = unix_listen, .shutdown = unix_shutdown, .setsockopt = sock_no_setsockopt, @@ -697,6 +717,7 @@ static const struct proto_ops unix_stream_ops = { .sendpage = unix_stream_sendpage, .splice_read = unix_stream_splice_read, .set_peek_off = unix_set_peek_off, + .show_fdinfo = unix_show_fdinfo, }; static const struct proto_ops unix_dgram_ops = { @@ -710,6 +731,9 @@ static const struct proto_ops unix_dgram_ops = { .getname = unix_getname, .poll = unix_dgram_poll, .ioctl = unix_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = unix_compat_ioctl, +#endif .listen = sock_no_listen, .shutdown = unix_shutdown, .setsockopt = sock_no_setsockopt, @@ -719,6 +743,7 @@ static const struct proto_ops unix_dgram_ops = { .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = unix_set_peek_off, + .show_fdinfo = unix_show_fdinfo, }; static const struct proto_ops unix_seqpacket_ops = { @@ -732,6 +757,9 @@ static const struct proto_ops unix_seqpacket_ops = { .getname = unix_getname, .poll = unix_dgram_poll, .ioctl = unix_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = unix_compat_ioctl, +#endif .listen = unix_listen, .shutdown = unix_shutdown, .setsockopt = sock_no_setsockopt, @@ -741,6 +769,7 @@ static const struct proto_ops unix_seqpacket_ops = { .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = unix_set_peek_off, + .show_fdinfo = unix_show_fdinfo, }; static struct proto unix_proto = { @@ -778,6 +807,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); + memset(&u->scm_stat, 0, sizeof(struct scm_stat)); unix_insert_socket(unix_sockets_unbound(sk), sk); out: if (sk == NULL) @@ -1562,6 +1592,28 @@ static bool unix_skb_scm_eq(struct sk_buff *skb, unix_secdata_eq(scm, skb); } +static void scm_stat_add(struct sock *sk, struct sk_buff *skb) +{ + struct scm_fp_list *fp = UNIXCB(skb).fp; + struct unix_sock *u = unix_sk(sk); + + lockdep_assert_held(&sk->sk_receive_queue.lock); + + if (unlikely(fp && fp->count)) + u->scm_stat.nr_fds += fp->count; +} + +static void scm_stat_del(struct sock *sk, struct sk_buff *skb) +{ + struct scm_fp_list *fp = UNIXCB(skb).fp; + struct unix_sock *u = unix_sk(sk); + + lockdep_assert_held(&sk->sk_receive_queue.lock); + + if (unlikely(fp && fp->count)) + u->scm_stat.nr_fds -= fp->count; +} + /* * Send AF_UNIX data. */ @@ -1712,7 +1764,8 @@ restart_locked: * - unix_peer(sk) == sk by time of get but disconnected before lock */ if (other != sk && - unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + unlikely(unix_peer(other) != sk && + unix_recvq_full_lockless(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); @@ -1747,7 +1800,10 @@ restart_locked: if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); maybe_add_creds(skb, sock, other); - skb_queue_tail(&other->sk_receive_queue, skb); + spin_lock(&other->sk_receive_queue.lock); + scm_stat_add(other, skb); + __skb_queue_tail(&other->sk_receive_queue, skb); + spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); other->sk_data_ready(other); sock_put(other); @@ -1849,7 +1905,10 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto pipe_err_free; maybe_add_creds(skb, sock, other); - skb_queue_tail(&other->sk_receive_queue, skb); + spin_lock(&other->sk_receive_queue.lock); + scm_stat_add(other, skb); + __skb_queue_tail(&other->sk_receive_queue, skb); + spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); other->sk_data_ready(other); sent += size; @@ -2048,8 +2107,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, mutex_lock(&u->iolock); skip = sk_peek_offset(sk, flags); - skb = __skb_try_recv_datagram(sk, flags, NULL, &skip, &err, - &last); + skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, + scm_stat_del, &skip, &err, &last); if (skb) break; @@ -2058,7 +2117,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, if (err != -EAGAIN) break; } while (timeo && - !__skb_wait_for_more_packets(sk, &err, &timeo, last)); + !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, + &err, &timeo, last)); if (!skb) { /* implies iolock unlocked */ unix_state_lock(sk); @@ -2343,8 +2403,12 @@ unlock: sk_peek_offset_bwd(sk, chunk); - if (UNIXCB(skb).fp) + if (UNIXCB(skb).fp) { + spin_lock(&sk->sk_receive_queue.lock); + scm_stat_del(sk, skb); + spin_unlock(&sk->sk_receive_queue.lock); unix_detach_fds(&scm, skb); + } if (unix_skb_len(skb)) break; @@ -2582,6 +2646,13 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return err; } +#ifdef CONFIG_COMPAT +static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; @@ -2599,7 +2670,7 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ @@ -2628,7 +2699,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, mask = 0; /* exceptional events? */ - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); @@ -2638,7 +2709,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, mask |= EPOLLHUP; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ @@ -2848,7 +2919,7 @@ static int __init af_unix_init(void) { int rc = -1; - BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb)); + BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); rc = proto_register(&unix_proto, 1); if (rc != 0) { diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig index 8abcb815af2d..56356d2980c8 100644 --- a/net/vmw_vsock/Kconfig +++ b/net/vmw_vsock/Kconfig @@ -26,6 +26,18 @@ config VSOCKETS_DIAG Enable this module so userspace applications can query open sockets. +config VSOCKETS_LOOPBACK + tristate "Virtual Sockets loopback transport" + depends on VSOCKETS + default y + select VIRTIO_VSOCKETS_COMMON + help + This module implements a loopback transport for Virtual Sockets, + using vmw_vsock_virtio_transport_common. + + To compile this driver as a module, choose M here: the module + will be called vsock_loopback. If unsure, say N. + config VMWARE_VMCI_VSOCKETS tristate "VMware VMCI transport for Virtual Sockets" depends on VSOCKETS && VMWARE_VMCI diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile index 7c6f9a0b67b0..6a943ec95c4a 100644 --- a/net/vmw_vsock/Makefile +++ b/net/vmw_vsock/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o +obj-$(CONFIG_VSOCKETS_LOOPBACK) += vsock_loopback.o vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index ab47bf3ab66e..9c5b2a91baad 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -126,19 +126,20 @@ static struct proto vsock_proto = { */ #define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) -static const struct vsock_transport *transport; +#define VSOCK_DEFAULT_BUFFER_SIZE (1024 * 256) +#define VSOCK_DEFAULT_BUFFER_MAX_SIZE (1024 * 256) +#define VSOCK_DEFAULT_BUFFER_MIN_SIZE 128 + +/* Transport used for host->guest communication */ +static const struct vsock_transport *transport_h2g; +/* Transport used for guest->host communication */ +static const struct vsock_transport *transport_g2h; +/* Transport used for DGRAM communication */ +static const struct vsock_transport *transport_dgram; +/* Transport used for local communication */ +static const struct vsock_transport *transport_local; static DEFINE_MUTEX(vsock_register_mutex); -/**** EXPORTS ****/ - -/* Get the ID of the local context. This is transport dependent. */ - -int vm_sockets_get_local_cid(void) -{ - return transport->get_local_cid(); -} -EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid); - /**** UTILS ****/ /* Each bound VSocket is stored in the bind hash table and each connected @@ -188,7 +189,7 @@ static int vsock_auto_bind(struct vsock_sock *vsk) return __vsock_bind(sk, &local_addr); } -static int __init vsock_init_tables(void) +static void vsock_init_tables(void) { int i; @@ -197,7 +198,6 @@ static int __init vsock_init_tables(void) for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) INIT_LIST_HEAD(&vsock_connected_table[i]); - return 0; } static void __vsock_insert_bound(struct list_head *list, @@ -230,10 +230,16 @@ static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr) { struct vsock_sock *vsk; - list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) - if (addr->svm_port == vsk->local_addr.svm_port) + list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) { + if (vsock_addr_equals_addr(addr, &vsk->local_addr)) return sk_vsock(vsk); + if (addr->svm_port == vsk->local_addr.svm_port && + (vsk->local_addr.svm_cid == VMADDR_CID_ANY || + addr->svm_cid == VMADDR_CID_ANY)) + return sk_vsock(vsk); + } + return NULL; } @@ -382,6 +388,106 @@ void vsock_enqueue_accept(struct sock *listener, struct sock *connected) } EXPORT_SYMBOL_GPL(vsock_enqueue_accept); +static bool vsock_use_local_transport(unsigned int remote_cid) +{ + if (!transport_local) + return false; + + if (remote_cid == VMADDR_CID_LOCAL) + return true; + + if (transport_g2h) { + return remote_cid == transport_g2h->get_local_cid(); + } else { + return remote_cid == VMADDR_CID_HOST; + } +} + +static void vsock_deassign_transport(struct vsock_sock *vsk) +{ + if (!vsk->transport) + return; + + vsk->transport->destruct(vsk); + module_put(vsk->transport->module); + vsk->transport = NULL; +} + +/* Assign a transport to a socket and call the .init transport callback. + * + * Note: for stream socket this must be called when vsk->remote_addr is set + * (e.g. during the connect() or when a connection request on a listener + * socket is received). + * The vsk->remote_addr is used to decide which transport to use: + * - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if + * g2h is not loaded, will use local transport; + * - remote CID <= VMADDR_CID_HOST will use guest->host transport; + * - remote CID > VMADDR_CID_HOST will use host->guest transport; + */ +int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) +{ + const struct vsock_transport *new_transport; + struct sock *sk = sk_vsock(vsk); + unsigned int remote_cid = vsk->remote_addr.svm_cid; + int ret; + + switch (sk->sk_type) { + case SOCK_DGRAM: + new_transport = transport_dgram; + break; + case SOCK_STREAM: + if (vsock_use_local_transport(remote_cid)) + new_transport = transport_local; + else if (remote_cid <= VMADDR_CID_HOST) + new_transport = transport_g2h; + else + new_transport = transport_h2g; + break; + default: + return -ESOCKTNOSUPPORT; + } + + if (vsk->transport) { + if (vsk->transport == new_transport) + return 0; + + vsk->transport->release(vsk); + vsock_deassign_transport(vsk); + } + + /* We increase the module refcnt to prevent the transport unloading + * while there are open sockets assigned to it. + */ + if (!new_transport || !try_module_get(new_transport->module)) + return -ENODEV; + + ret = new_transport->init(vsk, psk); + if (ret) { + module_put(new_transport->module); + return ret; + } + + vsk->transport = new_transport; + + return 0; +} +EXPORT_SYMBOL_GPL(vsock_assign_transport); + +bool vsock_find_cid(unsigned int cid) +{ + if (transport_g2h && cid == transport_g2h->get_local_cid()) + return true; + + if (transport_h2g && cid == VMADDR_CID_HOST) + return true; + + if (transport_local && cid == VMADDR_CID_LOCAL) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(vsock_find_cid); + static struct sock *vsock_dequeue_accept(struct sock *listener) { struct vsock_sock *vlistener; @@ -418,7 +524,12 @@ static bool vsock_is_pending(struct sock *sk) static int vsock_send_shutdown(struct sock *sk, int mode) { - return transport->shutdown(vsock_sk(sk), mode); + struct vsock_sock *vsk = vsock_sk(sk); + + if (!vsk->transport) + return -ENODEV; + + return vsk->transport->shutdown(vsk, mode); } static void vsock_pending_work(struct work_struct *work) @@ -439,7 +550,7 @@ static void vsock_pending_work(struct work_struct *work) if (vsock_is_pending(sk)) { vsock_remove_pending(listener, sk); - listener->sk_ack_backlog--; + sk_acceptq_removed(listener); } else if (!vsk->rejected) { /* We are not on the pending list and accept() did not reject * us, so we must have been accepted by our user process. We @@ -528,13 +639,12 @@ static int __vsock_bind_stream(struct vsock_sock *vsk, static int __vsock_bind_dgram(struct vsock_sock *vsk, struct sockaddr_vm *addr) { - return transport->dgram_bind(vsk, addr); + return vsk->transport->dgram_bind(vsk, addr); } static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) { struct vsock_sock *vsk = vsock_sk(sk); - u32 cid; int retval; /* First ensure this socket isn't already bound. */ @@ -544,10 +654,9 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) /* Now bind to the provided address or select appropriate values if * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that * like AF_INET prevents binding to a non-local IP address (in most - * cases), we only allow binding to the local CID. + * cases), we only allow binding to a local CID. */ - cid = transport->get_local_cid(); - if (addr->svm_cid != cid && addr->svm_cid != VMADDR_CID_ANY) + if (addr->svm_cid != VMADDR_CID_ANY && !vsock_find_cid(addr->svm_cid)) return -EADDRNOTAVAIL; switch (sk->sk_socket->type) { @@ -571,12 +680,12 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) static void vsock_connect_timeout(struct work_struct *work); -struct sock *__vsock_create(struct net *net, - struct socket *sock, - struct sock *parent, - gfp_t priority, - unsigned short type, - int kern) +static struct sock *__vsock_create(struct net *net, + struct socket *sock, + struct sock *parent, + gfp_t priority, + unsigned short type, + int kern) { struct sock *sk; struct vsock_sock *psk; @@ -620,46 +729,52 @@ struct sock *__vsock_create(struct net *net, vsk->trusted = psk->trusted; vsk->owner = get_cred(psk->owner); vsk->connect_timeout = psk->connect_timeout; + vsk->buffer_size = psk->buffer_size; + vsk->buffer_min_size = psk->buffer_min_size; + vsk->buffer_max_size = psk->buffer_max_size; } else { vsk->trusted = capable(CAP_NET_ADMIN); vsk->owner = get_current_cred(); vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; + vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE; + vsk->buffer_min_size = VSOCK_DEFAULT_BUFFER_MIN_SIZE; + vsk->buffer_max_size = VSOCK_DEFAULT_BUFFER_MAX_SIZE; } - if (transport->init(vsk, psk) < 0) { - sk_free(sk); - return NULL; - } - - if (sock) - vsock_insert_unbound(vsk); - return sk; } -EXPORT_SYMBOL_GPL(__vsock_create); -static void __vsock_release(struct sock *sk) +static void __vsock_release(struct sock *sk, int level) { if (sk) { - struct sk_buff *skb; struct sock *pending; struct vsock_sock *vsk; vsk = vsock_sk(sk); pending = NULL; /* Compiler warning. */ - transport->release(vsk); - - lock_sock(sk); + /* The release call is supposed to use lock_sock_nested() + * rather than lock_sock(), if a sock lock should be acquired. + */ + if (vsk->transport) + vsk->transport->release(vsk); + else if (sk->sk_type == SOCK_STREAM) + vsock_remove_sock(vsk); + + /* When "level" is SINGLE_DEPTH_NESTING, use the nested + * version to avoid the warning "possible recursive locking + * detected". When "level" is 0, lock_sock_nested(sk, level) + * is the same as lock_sock(sk). + */ + lock_sock_nested(sk, level); sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; - while ((skb = skb_dequeue(&sk->sk_receive_queue))) - kfree_skb(skb); + skb_queue_purge(&sk->sk_receive_queue); /* Clean up any sockets that never were accepted. */ while ((pending = vsock_dequeue_accept(sk)) != NULL) { - __vsock_release(pending); + __vsock_release(pending, SINGLE_DEPTH_NESTING); sock_put(pending); } @@ -672,7 +787,7 @@ static void vsock_sk_destruct(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); - transport->destruct(vsk); + vsock_deassign_transport(vsk); /* When clearing these addresses, there's no need to set the family and * possibly register the address family with the kernel. @@ -694,21 +809,28 @@ static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return err; } +struct sock *vsock_create_connected(struct sock *parent) +{ + return __vsock_create(sock_net(parent), NULL, parent, GFP_KERNEL, + parent->sk_type, 0); +} +EXPORT_SYMBOL_GPL(vsock_create_connected); + s64 vsock_stream_has_data(struct vsock_sock *vsk) { - return transport->stream_has_data(vsk); + return vsk->transport->stream_has_data(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_data); s64 vsock_stream_has_space(struct vsock_sock *vsk) { - return transport->stream_has_space(vsk); + return vsk->transport->stream_has_space(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_space); static int vsock_release(struct socket *sock) { - __vsock_release(sock->sk); + __vsock_release(sock->sk, 0); sock->sk = NULL; sock->state = SS_FREE; @@ -862,7 +984,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, * the queue and write as long as the socket isn't shutdown for * sending. */ - if (!skb_queue_empty(&sk->sk_receive_queue) || + if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) { mask |= EPOLLIN | EPOLLRDNORM; } @@ -871,6 +993,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; } else if (sock->type == SOCK_STREAM) { + const struct vsock_transport *transport = vsk->transport; lock_sock(sk); /* Listening sockets that have connections in their accept @@ -881,7 +1004,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, mask |= EPOLLIN | EPOLLRDNORM; /* If there is something in the queue then we can read. */ - if (transport->stream_is_active(vsk) && + if (transport && transport->stream_is_active(vsk) && !(sk->sk_shutdown & RCV_SHUTDOWN)) { bool data_ready_now = false; int ret = transport->notify_poll_in( @@ -946,6 +1069,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg, struct sock *sk; struct vsock_sock *vsk; struct sockaddr_vm *remote_addr; + const struct vsock_transport *transport; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; @@ -954,6 +1078,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg, err = 0; sk = sock->sk; vsk = vsock_sk(sk); + transport = vsk->transport; lock_sock(sk); @@ -1038,8 +1163,8 @@ static int vsock_dgram_connect(struct socket *sock, if (err) goto out; - if (!transport->dgram_allow(remote_addr->svm_cid, - remote_addr->svm_port)) { + if (!vsk->transport->dgram_allow(remote_addr->svm_cid, + remote_addr->svm_port)) { err = -EINVAL; goto out; } @@ -1055,7 +1180,9 @@ out: static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { - return transport->dgram_dequeue(vsock_sk(sock->sk), msg, len, flags); + struct vsock_sock *vsk = vsock_sk(sock->sk); + + return vsk->transport->dgram_dequeue(vsk, msg, len, flags); } static const struct proto_ops vsock_dgram_ops = { @@ -1081,6 +1208,8 @@ static const struct proto_ops vsock_dgram_ops = { static int vsock_transport_cancel_pkt(struct vsock_sock *vsk) { + const struct vsock_transport *transport = vsk->transport; + if (!transport->cancel_pkt) return -EOPNOTSUPP; @@ -1117,6 +1246,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, int err; struct sock *sk; struct vsock_sock *vsk; + const struct vsock_transport *transport; struct sockaddr_vm *remote_addr; long timeout; DEFINE_WAIT(wait); @@ -1151,19 +1281,26 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, goto out; } + /* Set the remote address that we are connecting to. */ + memcpy(&vsk->remote_addr, remote_addr, + sizeof(vsk->remote_addr)); + + err = vsock_assign_transport(vsk, NULL); + if (err) + goto out; + + transport = vsk->transport; + /* The hypervisor and well-known contexts do not have socket * endpoints. */ - if (!transport->stream_allow(remote_addr->svm_cid, + if (!transport || + !transport->stream_allow(remote_addr->svm_cid, remote_addr->svm_port)) { err = -ENETUNREACH; goto out; } - /* Set the remote address that we are connecting to. */ - memcpy(&vsk->remote_addr, remote_addr, - sizeof(vsk->remote_addr)); - err = vsock_auto_bind(vsk); if (err) goto out; @@ -1293,7 +1430,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, err = -listener->sk_err; if (connected) { - listener->sk_ack_backlog--; + sk_acceptq_removed(listener); lock_sock_nested(connected, SINGLE_DEPTH_NESTING); vconnected = vsock_sk(connected); @@ -1358,6 +1495,23 @@ out: return err; } +static void vsock_update_buffer_size(struct vsock_sock *vsk, + const struct vsock_transport *transport, + u64 val) +{ + if (val > vsk->buffer_max_size) + val = vsk->buffer_max_size; + + if (val < vsk->buffer_min_size) + val = vsk->buffer_min_size; + + if (val != vsk->buffer_size && + transport && transport->notify_buffer_size) + transport->notify_buffer_size(vsk, &val); + + vsk->buffer_size = val; +} + static int vsock_stream_setsockopt(struct socket *sock, int level, int optname, @@ -1367,6 +1521,7 @@ static int vsock_stream_setsockopt(struct socket *sock, int err; struct sock *sk; struct vsock_sock *vsk; + const struct vsock_transport *transport; u64 val; if (level != AF_VSOCK) @@ -1387,23 +1542,26 @@ static int vsock_stream_setsockopt(struct socket *sock, err = 0; sk = sock->sk; vsk = vsock_sk(sk); + transport = vsk->transport; lock_sock(sk); switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: COPY_IN(val); - transport->set_buffer_size(vsk, val); + vsock_update_buffer_size(vsk, transport, val); break; case SO_VM_SOCKETS_BUFFER_MAX_SIZE: COPY_IN(val); - transport->set_max_buffer_size(vsk, val); + vsk->buffer_max_size = val; + vsock_update_buffer_size(vsk, transport, vsk->buffer_size); break; case SO_VM_SOCKETS_BUFFER_MIN_SIZE: COPY_IN(val); - transport->set_min_buffer_size(vsk, val); + vsk->buffer_min_size = val; + vsock_update_buffer_size(vsk, transport, vsk->buffer_size); break; case SO_VM_SOCKETS_CONNECT_TIMEOUT: { @@ -1470,17 +1628,17 @@ static int vsock_stream_getsockopt(struct socket *sock, switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: - val = transport->get_buffer_size(vsk); + val = vsk->buffer_size; COPY_OUT(val); break; case SO_VM_SOCKETS_BUFFER_MAX_SIZE: - val = transport->get_max_buffer_size(vsk); + val = vsk->buffer_max_size; COPY_OUT(val); break; case SO_VM_SOCKETS_BUFFER_MIN_SIZE: - val = transport->get_min_buffer_size(vsk); + val = vsk->buffer_min_size; COPY_OUT(val); break; @@ -1511,6 +1669,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk; struct vsock_sock *vsk; + const struct vsock_transport *transport; ssize_t total_written; long timeout; int err; @@ -1519,6 +1678,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, sk = sock->sk; vsk = vsock_sk(sk); + transport = vsk->transport; total_written = 0; err = 0; @@ -1540,7 +1700,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if (sk->sk_state != TCP_ESTABLISHED || + if (!transport || sk->sk_state != TCP_ESTABLISHED || !vsock_addr_bound(&vsk->local_addr)) { err = -ENOTCONN; goto out; @@ -1650,6 +1810,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, { struct sock *sk; struct vsock_sock *vsk; + const struct vsock_transport *transport; int err; size_t target; ssize_t copied; @@ -1660,11 +1821,12 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, sk = sock->sk; vsk = vsock_sk(sk); + transport = vsk->transport; err = 0; lock_sock(sk); - if (sk->sk_state != TCP_ESTABLISHED) { + if (!transport || sk->sk_state != TCP_ESTABLISHED) { /* Recvmsg is supposed to return 0 if a peer performs an * orderly shutdown. Differentiate between that case and when a * peer has not connected or a local shutdown occured with the @@ -1838,6 +2000,10 @@ static const struct proto_ops vsock_stream_ops = { static int vsock_create(struct net *net, struct socket *sock, int protocol, int kern) { + struct vsock_sock *vsk; + struct sock *sk; + int ret; + if (!sock) return -EINVAL; @@ -1857,7 +2023,23 @@ static int vsock_create(struct net *net, struct socket *sock, sock->state = SS_UNCONNECTED; - return __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern) ? 0 : -ENOMEM; + sk = __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern); + if (!sk) + return -ENOMEM; + + vsk = vsock_sk(sk); + + if (sock->type == SOCK_DGRAM) { + ret = vsock_assign_transport(vsk, NULL); + if (ret < 0) { + sock_put(sk); + return ret; + } + } + + vsock_insert_unbound(vsk); + + return 0; } static const struct net_proto_family vsock_family_ops = { @@ -1870,11 +2052,20 @@ static long vsock_dev_do_ioctl(struct file *filp, unsigned int cmd, void __user *ptr) { u32 __user *p = ptr; + u32 cid = VMADDR_CID_ANY; int retval = 0; switch (cmd) { case IOCTL_VM_SOCKETS_GET_LOCAL_CID: - if (put_user(transport->get_local_cid(), p) != 0) + /* To be compatible with the VMCI behavior, we prioritize the + * guest CID instead of well-know host CID (VMADDR_CID_HOST). + */ + if (transport_g2h) + cid = transport_g2h->get_local_cid(); + else if (transport_h2g) + cid = transport_h2g->get_local_cid(); + + if (put_user(cid, p) != 0) retval = -EFAULT; break; @@ -1914,24 +2105,13 @@ static struct miscdevice vsock_device = { .fops = &vsock_device_ops, }; -int __vsock_core_init(const struct vsock_transport *t, struct module *owner) +static int __init vsock_init(void) { - int err = mutex_lock_interruptible(&vsock_register_mutex); + int err = 0; - if (err) - return err; - - if (transport) { - err = -EBUSY; - goto err_busy; - } - - /* Transport must be the owner of the protocol so that it can't - * unload while there are open sockets. - */ - vsock_proto.owner = owner; - transport = t; + vsock_init_tables(); + vsock_proto.owner = THIS_MODULE; vsock_device.minor = MISC_DYNAMIC_MINOR; err = misc_register(&vsock_device); if (err) { @@ -1952,7 +2132,6 @@ int __vsock_core_init(const struct vsock_transport *t, struct module *owner) goto err_unregister_proto; } - mutex_unlock(&vsock_register_mutex); return 0; err_unregister_proto: @@ -1960,44 +2139,99 @@ err_unregister_proto: err_deregister_misc: misc_deregister(&vsock_device); err_reset_transport: - transport = NULL; -err_busy: - mutex_unlock(&vsock_register_mutex); return err; } -EXPORT_SYMBOL_GPL(__vsock_core_init); -void vsock_core_exit(void) +static void __exit vsock_exit(void) { - mutex_lock(&vsock_register_mutex); - misc_deregister(&vsock_device); sock_unregister(AF_VSOCK); proto_unregister(&vsock_proto); - - /* We do not want the assignment below re-ordered. */ - mb(); - transport = NULL; - - mutex_unlock(&vsock_register_mutex); } -EXPORT_SYMBOL_GPL(vsock_core_exit); -const struct vsock_transport *vsock_core_get_transport(void) +const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk) { - /* vsock_register_mutex not taken since only the transport uses this - * function and only while registered. - */ - return transport; + return vsk->transport; } EXPORT_SYMBOL_GPL(vsock_core_get_transport); -static void __exit vsock_exit(void) +int vsock_core_register(const struct vsock_transport *t, int features) +{ + const struct vsock_transport *t_h2g, *t_g2h, *t_dgram, *t_local; + int err = mutex_lock_interruptible(&vsock_register_mutex); + + if (err) + return err; + + t_h2g = transport_h2g; + t_g2h = transport_g2h; + t_dgram = transport_dgram; + t_local = transport_local; + + if (features & VSOCK_TRANSPORT_F_H2G) { + if (t_h2g) { + err = -EBUSY; + goto err_busy; + } + t_h2g = t; + } + + if (features & VSOCK_TRANSPORT_F_G2H) { + if (t_g2h) { + err = -EBUSY; + goto err_busy; + } + t_g2h = t; + } + + if (features & VSOCK_TRANSPORT_F_DGRAM) { + if (t_dgram) { + err = -EBUSY; + goto err_busy; + } + t_dgram = t; + } + + if (features & VSOCK_TRANSPORT_F_LOCAL) { + if (t_local) { + err = -EBUSY; + goto err_busy; + } + t_local = t; + } + + transport_h2g = t_h2g; + transport_g2h = t_g2h; + transport_dgram = t_dgram; + transport_local = t_local; + +err_busy: + mutex_unlock(&vsock_register_mutex); + return err; +} +EXPORT_SYMBOL_GPL(vsock_core_register); + +void vsock_core_unregister(const struct vsock_transport *t) { - /* Do nothing. This function makes this module removable. */ + mutex_lock(&vsock_register_mutex); + + if (transport_h2g == t) + transport_h2g = NULL; + + if (transport_g2h == t) + transport_g2h = NULL; + + if (transport_dgram == t) + transport_dgram = NULL; + + if (transport_local == t) + transport_local = NULL; + + mutex_unlock(&vsock_register_mutex); } +EXPORT_SYMBOL_GPL(vsock_core_unregister); -module_init(vsock_init_tables); +module_init(vsock_init); module_exit(vsock_exit); MODULE_AUTHOR("VMware, Inc."); diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 9d864ebeb7b3..3492c021925f 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -13,15 +13,16 @@ #include <linux/hyperv.h> #include <net/sock.h> #include <net/af_vsock.h> +#include <asm/hyperv-tlfs.h> /* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some - * stricter requirements on the hv_sock ring buffer size of six 4K pages. Newer - * hosts don't have this limitation; but, keep the defaults the same for compat. + * stricter requirements on the hv_sock ring buffer size of six 4K pages. + * hyperv-tlfs defines HV_HYP_PAGE_SIZE as 4K. Newer hosts don't have this + * limitation; but, keep the defaults the same for compat. */ -#define PAGE_SIZE_4K 4096 -#define RINGBUFFER_HVS_RCV_SIZE (PAGE_SIZE_4K * 6) -#define RINGBUFFER_HVS_SND_SIZE (PAGE_SIZE_4K * 6) -#define RINGBUFFER_HVS_MAX_SIZE (PAGE_SIZE_4K * 64) +#define RINGBUFFER_HVS_RCV_SIZE (HV_HYP_PAGE_SIZE * 6) +#define RINGBUFFER_HVS_SND_SIZE (HV_HYP_PAGE_SIZE * 6) +#define RINGBUFFER_HVS_MAX_SIZE (HV_HYP_PAGE_SIZE * 64) /* The MTU is 16KB per the host side's design */ #define HVS_MTU_SIZE (1024 * 16) @@ -54,7 +55,8 @@ struct hvs_recv_buf { * ringbuffer APIs that allow us to directly copy data from userspace buffer * to VMBus ringbuffer. */ -#define HVS_SEND_BUF_SIZE (PAGE_SIZE_4K - sizeof(struct vmpipe_proto_header)) +#define HVS_SEND_BUF_SIZE \ + (HV_HYP_PAGE_SIZE - sizeof(struct vmpipe_proto_header)) struct hvs_send_buf { /* The header before the payload data */ @@ -77,11 +79,11 @@ struct hvs_send_buf { VMBUS_PKT_TRAILER_SIZE) union hvs_service_id { - uuid_le srv_id; + guid_t srv_id; struct { unsigned int svm_port; - unsigned char b[sizeof(uuid_le) - sizeof(unsigned int)]; + unsigned char b[sizeof(guid_t) - sizeof(unsigned int)]; }; }; @@ -89,8 +91,8 @@ union hvs_service_id { struct hvsock { struct vsock_sock *vsk; - uuid_le vm_srv_id; - uuid_le host_srv_id; + guid_t vm_srv_id; + guid_t host_srv_id; struct vmbus_channel *chan; struct vmpacket_descriptor *recv_desc; @@ -136,77 +138,39 @@ struct hvsock { **************************************************************************** * The only valid Service GUIDs, from the perspectives of both the host and * * Linux VM, that can be connected by the other end, must conform to this * - * format: <port>-facb-11e6-bd58-64006a7986d3, and the "port" must be in * - * this range [0, 0x7FFFFFFF]. * + * format: <port>-facb-11e6-bd58-64006a7986d3. * **************************************************************************** * * When we write apps on the host to connect(), the GUID ServiceID is used. * When we write apps in Linux VM to connect(), we only need to specify the * port and the driver will form the GUID and use that to request the host. * - * From the perspective of Linux VM: - * 1. the local ephemeral port (i.e. the local auto-bound port when we call - * connect() without explicit bind()) is generated by __vsock_bind_stream(), - * and the range is [1024, 0xFFFFFFFF). - * 2. the remote ephemeral port (i.e. the auto-generated remote port for - * a connect request initiated by the host's connect()) is generated by - * hvs_remote_addr_init() and the range is [0x80000000, 0xFFFFFFFF). */ -#define MAX_LISTEN_PORT ((u32)0x7FFFFFFF) -#define MAX_VM_LISTEN_PORT MAX_LISTEN_PORT -#define MAX_HOST_LISTEN_PORT MAX_LISTEN_PORT -#define MIN_HOST_EPHEMERAL_PORT (MAX_HOST_LISTEN_PORT + 1) - /* 00000000-facb-11e6-bd58-64006a7986d3 */ -static const uuid_le srv_id_template = - UUID_LE(0x00000000, 0xfacb, 0x11e6, 0xbd, 0x58, - 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3); +static const guid_t srv_id_template = + GUID_INIT(0x00000000, 0xfacb, 0x11e6, 0xbd, 0x58, + 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3); + +static bool hvs_check_transport(struct vsock_sock *vsk); -static bool is_valid_srv_id(const uuid_le *id) +static bool is_valid_srv_id(const guid_t *id) { - return !memcmp(&id->b[4], &srv_id_template.b[4], sizeof(uuid_le) - 4); + return !memcmp(&id->b[4], &srv_id_template.b[4], sizeof(guid_t) - 4); } -static unsigned int get_port_by_srv_id(const uuid_le *svr_id) +static unsigned int get_port_by_srv_id(const guid_t *svr_id) { return *((unsigned int *)svr_id); } -static void hvs_addr_init(struct sockaddr_vm *addr, const uuid_le *svr_id) +static void hvs_addr_init(struct sockaddr_vm *addr, const guid_t *svr_id) { unsigned int port = get_port_by_srv_id(svr_id); vsock_addr_init(addr, VMADDR_CID_ANY, port); } -static void hvs_remote_addr_init(struct sockaddr_vm *remote, - struct sockaddr_vm *local) -{ - static u32 host_ephemeral_port = MIN_HOST_EPHEMERAL_PORT; - struct sock *sk; - - vsock_addr_init(remote, VMADDR_CID_ANY, VMADDR_PORT_ANY); - - while (1) { - /* Wrap around ? */ - if (host_ephemeral_port < MIN_HOST_EPHEMERAL_PORT || - host_ephemeral_port == VMADDR_PORT_ANY) - host_ephemeral_port = MIN_HOST_EPHEMERAL_PORT; - - remote->svm_port = host_ephemeral_port++; - - sk = vsock_find_connected_socket(remote, local); - if (!sk) { - /* Found an available ephemeral port */ - return; - } - - /* Release refcnt got in vsock_find_connected_socket */ - sock_put(sk); - } -} - static void hvs_set_channel_pending_send_size(struct vmbus_channel *chan) { set_channel_pending_send_size(chan, @@ -321,7 +285,7 @@ static void hvs_close_connection(struct vmbus_channel *chan) static void hvs_open_connection(struct vmbus_channel *chan) { - uuid_le *if_instance, *if_type; + guid_t *if_instance, *if_type; unsigned char conn_from_host; struct sockaddr_vm addr; @@ -336,12 +300,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) if_type = &chan->offermsg.offer.if_type; if_instance = &chan->offermsg.offer.if_instance; conn_from_host = chan->offermsg.offer.u.pipe.user_def[0]; - - /* The host or the VM should only listen on a port in - * [0, MAX_LISTEN_PORT] - */ - if (!is_valid_srv_id(if_type) || - get_port_by_srv_id(if_type) > MAX_LISTEN_PORT) + if (!is_valid_srv_id(if_type)) return; hvs_addr_init(&addr, conn_from_host ? if_type : if_instance); @@ -358,13 +317,27 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (sk->sk_ack_backlog >= sk->sk_max_ack_backlog) goto out; - new = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, - sk->sk_type, 0); + new = vsock_create_connected(sk); if (!new) goto out; new->sk_state = TCP_SYN_SENT; vnew = vsock_sk(new); + + hvs_addr_init(&vnew->local_addr, if_type); + + /* Remote peer is always the host */ + vsock_addr_init(&vnew->remote_addr, + VMADDR_CID_HOST, VMADDR_PORT_ANY); + vnew->remote_addr.svm_port = get_port_by_srv_id(if_instance); + ret = vsock_assign_transport(vnew, vsock_sk(sk)); + /* Transport assigned (looking at remote_addr) must be the + * same where we received the request. + */ + if (ret || !hvs_check_transport(vnew)) { + sock_put(new); + goto out; + } hvs_new = vnew->trans; hvs_new->chan = chan; } else { @@ -393,10 +366,10 @@ static void hvs_open_connection(struct vmbus_channel *chan) } else { sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE); sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE); - sndbuf = ALIGN(sndbuf, PAGE_SIZE); + sndbuf = ALIGN(sndbuf, HV_HYP_PAGE_SIZE); rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE); rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE); - rcvbuf = ALIGN(rcvbuf, PAGE_SIZE); + rcvbuf = ALIGN(rcvbuf, HV_HYP_PAGE_SIZE); } ret = vmbus_open(chan, sndbuf, rcvbuf, NULL, 0, hvs_channel_cb, @@ -426,10 +399,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (conn_from_host) { new->sk_state = TCP_ESTABLISHED; - sk->sk_ack_backlog++; - - hvs_addr_init(&vnew->local_addr, if_type); - hvs_remote_addr_init(&vnew->remote_addr, &vnew->local_addr); + sk_acceptq_added(sk); hvs_new->vm_srv_id = *if_type; hvs_new->host_srv_id = *if_instance; @@ -559,7 +529,7 @@ static void hvs_release(struct vsock_sock *vsk) struct sock *sk = sk_vsock(vsk); bool remove_sock; - lock_sock(sk); + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); remove_sock = hvs_close_lock_held(vsk); release_sock(sk); if (remove_sock) @@ -670,7 +640,7 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg, ssize_t ret = 0; ssize_t bytes_written = 0; - BUILD_BUG_ON(sizeof(*send_buf) != PAGE_SIZE_4K); + BUILD_BUG_ON(sizeof(*send_buf) != HV_HYP_PAGE_SIZE); send_buf = kmalloc(sizeof(*send_buf), GFP_KERNEL); if (!send_buf) @@ -753,16 +723,6 @@ static bool hvs_stream_is_active(struct vsock_sock *vsk) static bool hvs_stream_allow(u32 cid, u32 port) { - /* The host's port range [MIN_HOST_EPHEMERAL_PORT, 0xFFFFFFFF) is - * reserved as ephemeral ports, which are used as the host's ports - * when the host initiates connections. - * - * Perform this check in the guest so an immediate error is produced - * instead of a timeout. - */ - if (port > MAX_HOST_LISTEN_PORT) - return false; - if (cid == VMADDR_CID_HOST) return true; @@ -843,37 +803,9 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, ssize_t written, return 0; } -static void hvs_set_buffer_size(struct vsock_sock *vsk, u64 val) -{ - /* Ignored. */ -} - -static void hvs_set_min_buffer_size(struct vsock_sock *vsk, u64 val) -{ - /* Ignored. */ -} - -static void hvs_set_max_buffer_size(struct vsock_sock *vsk, u64 val) -{ - /* Ignored. */ -} - -static u64 hvs_get_buffer_size(struct vsock_sock *vsk) -{ - return -ENOPROTOOPT; -} - -static u64 hvs_get_min_buffer_size(struct vsock_sock *vsk) -{ - return -ENOPROTOOPT; -} - -static u64 hvs_get_max_buffer_size(struct vsock_sock *vsk) -{ - return -ENOPROTOOPT; -} - static struct vsock_transport hvs_transport = { + .module = THIS_MODULE, + .get_local_cid = hvs_get_local_cid, .init = hvs_sock_init, @@ -906,14 +838,13 @@ static struct vsock_transport hvs_transport = { .notify_send_pre_enqueue = hvs_notify_send_pre_enqueue, .notify_send_post_enqueue = hvs_notify_send_post_enqueue, - .set_buffer_size = hvs_set_buffer_size, - .set_min_buffer_size = hvs_set_min_buffer_size, - .set_max_buffer_size = hvs_set_max_buffer_size, - .get_buffer_size = hvs_get_buffer_size, - .get_min_buffer_size = hvs_get_min_buffer_size, - .get_max_buffer_size = hvs_get_max_buffer_size, }; +static bool hvs_check_transport(struct vsock_sock *vsk) +{ + return vsk->transport == &hvs_transport; +} + static int hvs_probe(struct hv_device *hdev, const struct hv_vmbus_device_id *dev_id) { @@ -938,6 +869,24 @@ static int hvs_remove(struct hv_device *hdev) return 0; } +/* hv_sock connections can not persist across hibernation, and all the hv_sock + * channels are forced to be rescinded before hibernation: see + * vmbus_bus_suspend(). Here the dummy hvs_suspend() and hvs_resume() + * are only needed because hibernation requires that every vmbus device's + * driver should have a .suspend and .resume callback: see vmbus_suspend(). + */ +static int hvs_suspend(struct hv_device *hv_dev) +{ + /* Dummy */ + return 0; +} + +static int hvs_resume(struct hv_device *dev) +{ + /* Dummy */ + return 0; +} + /* This isn't really used. See vmbus_match() and vmbus_probe() */ static const struct hv_vmbus_device_id id_table[] = { {}, @@ -949,6 +898,8 @@ static struct hv_driver hvs_drv = { .id_table = id_table, .probe = hvs_probe, .remove = hvs_remove, + .suspend = hvs_suspend, + .resume = hvs_resume, }; static int __init hvs_init(void) @@ -962,7 +913,7 @@ static int __init hvs_init(void) if (ret != 0) return ret; - ret = vsock_core_init(&hvs_transport); + ret = vsock_core_register(&hvs_transport, VSOCK_TRANSPORT_F_G2H); if (ret) { vmbus_driver_unregister(&hvs_drv); return ret; @@ -973,7 +924,7 @@ static int __init hvs_init(void) static void __exit hvs_exit(void) { - vsock_core_exit(); + vsock_core_unregister(&hvs_transport); vmbus_driver_unregister(&hvs_drv); } diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 0815d1357861..dfbaf6bd8b1c 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -44,10 +44,6 @@ struct virtio_vsock { spinlock_t send_pkt_list_lock; struct list_head send_pkt_list; - struct work_struct loopback_work; - spinlock_t loopback_list_lock; /* protects loopback_list */ - struct list_head loopback_list; - atomic_t queued_replies; /* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX] @@ -86,47 +82,6 @@ out_rcu: return ret; } -static void virtio_transport_loopback_work(struct work_struct *work) -{ - struct virtio_vsock *vsock = - container_of(work, struct virtio_vsock, loopback_work); - LIST_HEAD(pkts); - - spin_lock_bh(&vsock->loopback_list_lock); - list_splice_init(&vsock->loopback_list, &pkts); - spin_unlock_bh(&vsock->loopback_list_lock); - - mutex_lock(&vsock->rx_lock); - - if (!vsock->rx_run) - goto out; - - while (!list_empty(&pkts)) { - struct virtio_vsock_pkt *pkt; - - pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list); - list_del_init(&pkt->list); - - virtio_transport_recv_pkt(pkt); - } -out: - mutex_unlock(&vsock->rx_lock); -} - -static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock, - struct virtio_vsock_pkt *pkt) -{ - int len = pkt->len; - - spin_lock_bh(&vsock->loopback_list_lock); - list_add_tail(&pkt->list, &vsock->loopback_list); - spin_unlock_bh(&vsock->loopback_list_lock); - - queue_work(virtio_vsock_workqueue, &vsock->loopback_work); - - return len; -} - static void virtio_transport_send_pkt_work(struct work_struct *work) { @@ -221,7 +176,8 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt) } if (le64_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) { - len = virtio_transport_send_pkt_loopback(vsock, pkt); + virtio_transport_free_pkt(pkt); + len = -ENODEV; goto out_rcu; } @@ -307,6 +263,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) break; } + pkt->buf_len = buf_len; pkt->len = buf_len; sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); @@ -369,59 +326,6 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock) return val < virtqueue_get_vring_size(vq); } -static void virtio_transport_rx_work(struct work_struct *work) -{ - struct virtio_vsock *vsock = - container_of(work, struct virtio_vsock, rx_work); - struct virtqueue *vq; - - vq = vsock->vqs[VSOCK_VQ_RX]; - - mutex_lock(&vsock->rx_lock); - - if (!vsock->rx_run) - goto out; - - do { - virtqueue_disable_cb(vq); - for (;;) { - struct virtio_vsock_pkt *pkt; - unsigned int len; - - if (!virtio_transport_more_replies(vsock)) { - /* Stop rx until the device processes already - * pending replies. Leave rx virtqueue - * callbacks disabled. - */ - goto out; - } - - pkt = virtqueue_get_buf(vq, &len); - if (!pkt) { - break; - } - - vsock->rx_buf_nr--; - - /* Drop short/long packets */ - if (unlikely(len < sizeof(pkt->hdr) || - len > sizeof(pkt->hdr) + pkt->len)) { - virtio_transport_free_pkt(pkt); - continue; - } - - pkt->len = len - sizeof(pkt->hdr); - virtio_transport_deliver_tap_pkt(pkt); - virtio_transport_recv_pkt(pkt); - } - } while (!virtqueue_enable_cb(vq)); - -out: - if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2) - virtio_vsock_rx_fill(vsock); - mutex_unlock(&vsock->rx_lock); -} - /* event_lock must be held */ static int virtio_vsock_event_fill_one(struct virtio_vsock *vsock, struct virtio_vsock_event *event) @@ -541,6 +445,8 @@ static void virtio_vsock_rx_done(struct virtqueue *vq) static struct virtio_transport virtio_transport = { .transport = { + .module = THIS_MODULE, + .get_local_cid = virtio_transport_get_local_cid, .init = virtio_transport_do_socket_init, @@ -573,18 +479,65 @@ static struct virtio_transport virtio_transport = { .notify_send_pre_block = virtio_transport_notify_send_pre_block, .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, - - .set_buffer_size = virtio_transport_set_buffer_size, - .set_min_buffer_size = virtio_transport_set_min_buffer_size, - .set_max_buffer_size = virtio_transport_set_max_buffer_size, - .get_buffer_size = virtio_transport_get_buffer_size, - .get_min_buffer_size = virtio_transport_get_min_buffer_size, - .get_max_buffer_size = virtio_transport_get_max_buffer_size, + .notify_buffer_size = virtio_transport_notify_buffer_size, }, .send_pkt = virtio_transport_send_pkt, }; +static void virtio_transport_rx_work(struct work_struct *work) +{ + struct virtio_vsock *vsock = + container_of(work, struct virtio_vsock, rx_work); + struct virtqueue *vq; + + vq = vsock->vqs[VSOCK_VQ_RX]; + + mutex_lock(&vsock->rx_lock); + + if (!vsock->rx_run) + goto out; + + do { + virtqueue_disable_cb(vq); + for (;;) { + struct virtio_vsock_pkt *pkt; + unsigned int len; + + if (!virtio_transport_more_replies(vsock)) { + /* Stop rx until the device processes already + * pending replies. Leave rx virtqueue + * callbacks disabled. + */ + goto out; + } + + pkt = virtqueue_get_buf(vq, &len); + if (!pkt) { + break; + } + + vsock->rx_buf_nr--; + + /* Drop short/long packets */ + if (unlikely(len < sizeof(pkt->hdr) || + len > sizeof(pkt->hdr) + pkt->len)) { + virtio_transport_free_pkt(pkt); + continue; + } + + pkt->len = len - sizeof(pkt->hdr); + virtio_transport_deliver_tap_pkt(pkt); + virtio_transport_recv_pkt(&virtio_transport, pkt); + } + } while (!virtqueue_enable_cb(vq)); + +out: + if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2) + virtio_vsock_rx_fill(vsock); + mutex_unlock(&vsock->rx_lock); +} + static int virtio_vsock_probe(struct virtio_device *vdev) { vq_callback_t *callbacks[] = { @@ -636,13 +589,10 @@ static int virtio_vsock_probe(struct virtio_device *vdev) mutex_init(&vsock->event_lock); spin_lock_init(&vsock->send_pkt_list_lock); INIT_LIST_HEAD(&vsock->send_pkt_list); - spin_lock_init(&vsock->loopback_list_lock); - INIT_LIST_HEAD(&vsock->loopback_list); INIT_WORK(&vsock->rx_work, virtio_transport_rx_work); INIT_WORK(&vsock->tx_work, virtio_transport_tx_work); INIT_WORK(&vsock->event_work, virtio_transport_event_work); INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work); - INIT_WORK(&vsock->loopback_work, virtio_transport_loopback_work); mutex_lock(&vsock->tx_lock); vsock->tx_run = true; @@ -723,22 +673,12 @@ static void virtio_vsock_remove(struct virtio_device *vdev) } spin_unlock_bh(&vsock->send_pkt_list_lock); - spin_lock_bh(&vsock->loopback_list_lock); - while (!list_empty(&vsock->loopback_list)) { - pkt = list_first_entry(&vsock->loopback_list, - struct virtio_vsock_pkt, list); - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } - spin_unlock_bh(&vsock->loopback_list_lock); - /* Delete virtqueues and flush outstanding callbacks if any */ vdev->config->del_vqs(vdev); /* Other works can be queued before 'config->del_vqs()', so we flush * all works before to free the vsock object to avoid use after free. */ - flush_work(&vsock->loopback_work); flush_work(&vsock->rx_work); flush_work(&vsock->tx_work); flush_work(&vsock->event_work); @@ -775,7 +715,8 @@ static int __init virtio_vsock_init(void) if (!virtio_vsock_workqueue) return -ENOMEM; - ret = vsock_core_init(&virtio_transport.transport); + ret = vsock_core_register(&virtio_transport.transport, + VSOCK_TRANSPORT_F_G2H); if (ret) goto out_wq; @@ -786,7 +727,7 @@ static int __init virtio_vsock_init(void) return 0; out_vci: - vsock_core_exit(); + vsock_core_unregister(&virtio_transport.transport); out_wq: destroy_workqueue(virtio_vsock_workqueue); return ret; @@ -795,7 +736,7 @@ out_wq: static void __exit virtio_vsock_exit(void) { unregister_virtio_driver(&virtio_vsock_driver); - vsock_core_exit(); + vsock_core_unregister(&virtio_transport.transport); destroy_workqueue(virtio_vsock_workqueue); } diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 6f1a8aff65c5..d9f0c9c5425a 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -11,9 +11,6 @@ #include <linux/sched/signal.h> #include <linux/ctype.h> #include <linux/list.h> -#include <linux/virtio.h> -#include <linux/virtio_ids.h> -#include <linux/virtio_config.h> #include <linux/virtio_vsock.h> #include <uapi/linux/vsockmon.h> @@ -26,9 +23,16 @@ /* How long to wait for graceful shutdown of a connection */ #define VSOCK_CLOSE_TIMEOUT (8 * HZ) -static const struct virtio_transport *virtio_transport_get_ops(void) +/* Threshold for detecting small packets to copy */ +#define GOOD_COPY_LEN 128 + +static const struct virtio_transport * +virtio_transport_get_ops(struct vsock_sock *vsk) { - const struct vsock_transport *t = vsock_core_get_transport(); + const struct vsock_transport *t = vsock_core_get_transport(vsk); + + if (WARN_ON(!t)) + return NULL; return container_of(t, struct virtio_transport, transport); } @@ -64,6 +68,9 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, pkt->buf = kmalloc(len, GFP_KERNEL); if (!pkt->buf) goto out_pkt; + + pkt->buf_len = len; + err = memcpy_from_msg(pkt->buf, info->msg, len); if (err) goto out; @@ -91,8 +98,17 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) struct virtio_vsock_pkt *pkt = opaque; struct af_vsockmon_hdr *hdr; struct sk_buff *skb; + size_t payload_len; + void *payload_buf; - skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len, + /* A packet could be split to fit the RX buffer, so we can retrieve + * the payload length from the header and the buffer pointer taking + * care of the offset in the original packet. + */ + payload_len = le32_to_cpu(pkt->hdr.len); + payload_buf = pkt->buf + pkt->off; + + skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + payload_len, GFP_ATOMIC); if (!skb) return NULL; @@ -132,8 +148,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr)); - if (pkt->len) { - skb_put_data(skb, pkt->buf, pkt->len); + if (payload_len) { + skb_put_data(skb, payload_buf, payload_len); } return skb; @@ -145,15 +161,25 @@ void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt) } EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt); +/* This function can only be used on connecting/connected sockets, + * since a socket assigned to a transport is required. + * + * Do not use on listener sockets! + */ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, struct virtio_vsock_pkt_info *info) { u32 src_cid, src_port, dst_cid, dst_port; + const struct virtio_transport *t_ops; struct virtio_vsock_sock *vvs; struct virtio_vsock_pkt *pkt; u32 pkt_len = info->pkt_len; - src_cid = vm_sockets_get_local_cid(); + t_ops = virtio_transport_get_ops(vsk); + if (unlikely(!t_ops)) + return -EFAULT; + + src_cid = t_ops->transport.get_local_cid(); src_port = vsk->local_addr.svm_port; if (!info->remote_cid) { dst_cid = vsk->remote_addr.svm_cid; @@ -166,8 +192,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, vvs = vsk->trans; /* we can send less than pkt_len bytes */ - if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE) - pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; + if (pkt_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) + pkt_len = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE; /* virtio_transport_get_credit might return less than pkt_len credit */ pkt_len = virtio_transport_get_credit(vvs, pkt_len); @@ -186,13 +212,17 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, virtio_transport_inc_tx_pkt(vvs, pkt); - return virtio_transport_get_ops()->send_pkt(pkt); + return t_ops->send_pkt(pkt); } -static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, +static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt) { + if (vvs->rx_bytes + pkt->len > vvs->buf_alloc) + return false; + vvs->rx_bytes += pkt->len; + return true; } static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, @@ -204,10 +234,11 @@ static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt) { - spin_lock_bh(&vvs->tx_lock); + spin_lock_bh(&vvs->rx_lock); + vvs->last_fwd_cnt = vvs->fwd_cnt; pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt); pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc); - spin_unlock_bh(&vvs->tx_lock); + spin_unlock_bh(&vvs->rx_lock); } EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); @@ -248,6 +279,55 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk, } static ssize_t +virtio_transport_stream_do_peek(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct virtio_vsock_pkt *pkt; + size_t bytes, total = 0, off; + int err = -EFAULT; + + spin_lock_bh(&vvs->rx_lock); + + list_for_each_entry(pkt, &vvs->rx_queue, list) { + off = pkt->off; + + if (total == len) + break; + + while (total < len && off < pkt->len) { + bytes = len - total; + if (bytes > pkt->len - off) + bytes = pkt->len - off; + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + spin_unlock_bh(&vvs->rx_lock); + + err = memcpy_to_msg(msg, pkt->buf + off, bytes); + if (err) + goto out; + + spin_lock_bh(&vvs->rx_lock); + + total += bytes; + off += bytes; + } + } + + spin_unlock_bh(&vvs->rx_lock); + + return total; + +out: + if (total) + err = total; + return err; +} + +static ssize_t virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, struct msghdr *msg, size_t len) @@ -255,6 +335,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, struct virtio_vsock_sock *vvs = vsk->trans; struct virtio_vsock_pkt *pkt; size_t bytes, total = 0; + u32 free_space; int err = -EFAULT; spin_lock_bh(&vvs->rx_lock); @@ -285,11 +366,24 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, virtio_transport_free_pkt(pkt); } } + + free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt); + spin_unlock_bh(&vvs->rx_lock); - /* Send a credit pkt to peer */ - virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, - NULL); + /* To reduce the number of credit update messages, + * don't update credits as long as lots of space is available. + * Note: the limit chosen here is arbitrary. Setting the limit + * too high causes extra messages. Too low causes transmitter + * stalls. As stalls are in theory more expensive than extra + * messages, we set the limit to a high value. TODO: experiment + * with different values. + */ + if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) { + virtio_transport_send_credit_update(vsk, + VIRTIO_VSOCK_TYPE_STREAM, + NULL); + } return total; @@ -305,9 +399,9 @@ virtio_transport_stream_dequeue(struct vsock_sock *vsk, size_t len, int flags) { if (flags & MSG_PEEK) - return -EOPNOTSUPP; - - return virtio_transport_stream_do_dequeue(vsk, msg, len); + return virtio_transport_stream_do_peek(vsk, msg, len); + else + return virtio_transport_stream_do_dequeue(vsk, msg, len); } EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue); @@ -369,20 +463,16 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk, vsk->trans = vvs; vvs->vsk = vsk; - if (psk) { + if (psk && psk->trans) { struct virtio_vsock_sock *ptrans = psk->trans; - vvs->buf_size = ptrans->buf_size; - vvs->buf_size_min = ptrans->buf_size_min; - vvs->buf_size_max = ptrans->buf_size_max; vvs->peer_buf_alloc = ptrans->peer_buf_alloc; - } else { - vvs->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE; - vvs->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE; - vvs->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE; } - vvs->buf_alloc = vvs->buf_size; + if (vsk->buffer_size > VIRTIO_VSOCK_MAX_BUF_SIZE) + vsk->buffer_size = VIRTIO_VSOCK_MAX_BUF_SIZE; + + vvs->buf_alloc = vsk->buffer_size; spin_lock_init(&vvs->rx_lock); spin_lock_init(&vvs->tx_lock); @@ -392,68 +482,20 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk, } EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init); -u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk) +/* sk_lock held by the caller */ +void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val) { struct virtio_vsock_sock *vvs = vsk->trans; - return vvs->buf_size; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size); - -u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk) -{ - struct virtio_vsock_sock *vvs = vsk->trans; + if (*val > VIRTIO_VSOCK_MAX_BUF_SIZE) + *val = VIRTIO_VSOCK_MAX_BUF_SIZE; - return vvs->buf_size_min; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size); - -u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk) -{ - struct virtio_vsock_sock *vvs = vsk->trans; - - return vvs->buf_size_max; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size); - -void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) -{ - struct virtio_vsock_sock *vvs = vsk->trans; - - if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) - val = VIRTIO_VSOCK_MAX_BUF_SIZE; - if (val < vvs->buf_size_min) - vvs->buf_size_min = val; - if (val > vvs->buf_size_max) - vvs->buf_size_max = val; - vvs->buf_size = val; - vvs->buf_alloc = val; -} -EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size); - -void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val) -{ - struct virtio_vsock_sock *vvs = vsk->trans; - - if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) - val = VIRTIO_VSOCK_MAX_BUF_SIZE; - if (val > vvs->buf_size) - vvs->buf_size = val; - vvs->buf_size_min = val; -} -EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size); - -void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val) -{ - struct virtio_vsock_sock *vvs = vsk->trans; + vvs->buf_alloc = *val; - if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) - val = VIRTIO_VSOCK_MAX_BUF_SIZE; - if (val < vvs->buf_size) - vvs->buf_size = val; - vvs->buf_size_max = val; + virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, + NULL); } -EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size); +EXPORT_SYMBOL_GPL(virtio_transport_notify_buffer_size); int virtio_transport_notify_poll_in(struct vsock_sock *vsk, @@ -545,9 +587,7 @@ EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue); u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk) { - struct virtio_vsock_sock *vvs = vsk->trans; - - return vvs->buf_size; + return vsk->buffer_size; } EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat); @@ -659,9 +699,9 @@ static int virtio_transport_reset(struct vsock_sock *vsk, /* Normally packets are associated with a socket. There may be no socket if an * attempt was made to connect to a socket that does not exist. */ -static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt) +static int virtio_transport_reset_no_sock(const struct virtio_transport *t, + struct virtio_vsock_pkt *pkt) { - const struct virtio_transport *t; struct virtio_vsock_pkt *reply; struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RST, @@ -681,7 +721,6 @@ static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt) if (!reply) return -ENOMEM; - t = virtio_transport_get_ops(); if (!t) { virtio_transport_free_pkt(reply); return -ENOTCONN; @@ -790,7 +829,7 @@ void virtio_transport_release(struct vsock_sock *vsk) struct sock *sk = &vsk->sk; bool remove_sock = true; - lock_sock(sk); + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (sk->sk_type == SOCK_STREAM) remove_sock = virtio_transport_close(vsk); @@ -841,24 +880,64 @@ destroy: return err; } +static void +virtio_transport_recv_enqueue(struct vsock_sock *vsk, + struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + bool can_enqueue, free_pkt = false; + + pkt->len = le32_to_cpu(pkt->hdr.len); + pkt->off = 0; + + spin_lock_bh(&vvs->rx_lock); + + can_enqueue = virtio_transport_inc_rx_pkt(vvs, pkt); + if (!can_enqueue) { + free_pkt = true; + goto out; + } + + /* Try to copy small packets into the buffer of last packet queued, + * to avoid wasting memory queueing the entire buffer with a small + * payload. + */ + if (pkt->len <= GOOD_COPY_LEN && !list_empty(&vvs->rx_queue)) { + struct virtio_vsock_pkt *last_pkt; + + last_pkt = list_last_entry(&vvs->rx_queue, + struct virtio_vsock_pkt, list); + + /* If there is space in the last packet queued, we copy the + * new packet in its buffer. + */ + if (pkt->len <= last_pkt->buf_len - last_pkt->len) { + memcpy(last_pkt->buf + last_pkt->len, pkt->buf, + pkt->len); + last_pkt->len += pkt->len; + free_pkt = true; + goto out; + } + } + + list_add_tail(&pkt->list, &vvs->rx_queue); + +out: + spin_unlock_bh(&vvs->rx_lock); + if (free_pkt) + virtio_transport_free_pkt(pkt); +} + static int virtio_transport_recv_connected(struct sock *sk, struct virtio_vsock_pkt *pkt) { struct vsock_sock *vsk = vsock_sk(sk); - struct virtio_vsock_sock *vvs = vsk->trans; int err = 0; switch (le16_to_cpu(pkt->hdr.op)) { case VIRTIO_VSOCK_OP_RW: - pkt->len = le32_to_cpu(pkt->hdr.len); - pkt->off = 0; - - spin_lock_bh(&vvs->rx_lock); - virtio_transport_inc_rx_pkt(vvs, pkt); - list_add_tail(&pkt->list, &vvs->rx_queue); - spin_unlock_bh(&vvs->rx_lock); - + virtio_transport_recv_enqueue(vsk, pkt); sk->sk_data_ready(sk); return err; case VIRTIO_VSOCK_OP_CREDIT_UPDATE: @@ -870,9 +949,11 @@ virtio_transport_recv_connected(struct sock *sk, if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) vsk->peer_shutdown |= SEND_SHUTDOWN; if (vsk->peer_shutdown == SHUTDOWN_MASK && - vsock_stream_has_data(vsk) <= 0) { - sock_set_flag(sk, SOCK_DONE); - sk->sk_state = TCP_CLOSING; + vsock_stream_has_data(vsk) <= 0 && + !sock_flag(sk, SOCK_DONE)) { + (void)virtio_transport_reset(vsk, NULL); + + virtio_transport_do_close(vsk, true); } if (le32_to_cpu(pkt->hdr.flags)) sk->sk_state_change(sk); @@ -915,32 +996,57 @@ virtio_transport_send_response(struct vsock_sock *vsk, return virtio_transport_send_pkt_info(vsk, &info); } +static bool virtio_transport_space_update(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_vsock_sock *vvs = vsk->trans; + bool space_available; + + /* Listener sockets are not associated with any transport, so we are + * not able to take the state to see if there is space available in the + * remote peer, but since they are only used to receive requests, we + * can assume that there is always space available in the other peer. + */ + if (!vvs) + return true; + + /* buf_alloc and fwd_cnt is always included in the hdr */ + spin_lock_bh(&vvs->tx_lock); + vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); + vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); + space_available = virtio_transport_has_space(vsk); + spin_unlock_bh(&vvs->tx_lock); + return space_available; +} + /* Handle server socket */ static int -virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) +virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, + struct virtio_transport *t) { struct vsock_sock *vsk = vsock_sk(sk); struct vsock_sock *vchild; struct sock *child; + int ret; if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) { - virtio_transport_reset(vsk, pkt); + virtio_transport_reset_no_sock(t, pkt); return -EINVAL; } if (sk_acceptq_is_full(sk)) { - virtio_transport_reset(vsk, pkt); + virtio_transport_reset_no_sock(t, pkt); return -ENOMEM; } - child = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, - sk->sk_type, 0); + child = vsock_create_connected(sk); if (!child) { - virtio_transport_reset(vsk, pkt); + virtio_transport_reset_no_sock(t, pkt); return -ENOMEM; } - sk->sk_ack_backlog++; + sk_acceptq_added(sk); lock_sock_nested(child, SINGLE_DEPTH_NESTING); @@ -952,6 +1058,20 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) vsock_addr_init(&vchild->remote_addr, le64_to_cpu(pkt->hdr.src_cid), le32_to_cpu(pkt->hdr.src_port)); + ret = vsock_assign_transport(vchild, vsk); + /* Transport assigned (looking at remote_addr) must be the same + * where we received the request. + */ + if (ret || vchild->transport != &t->transport) { + release_sock(child); + virtio_transport_reset_no_sock(t, pkt); + sock_put(child); + return ret; + } + + if (virtio_transport_space_update(child, pkt)) + child->sk_write_space(child); + vsock_insert_connected(vchild); vsock_enqueue_accept(sk, child); virtio_transport_send_response(vchild, pkt); @@ -962,26 +1082,11 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) return 0; } -static bool virtio_transport_space_update(struct sock *sk, - struct virtio_vsock_pkt *pkt) -{ - struct vsock_sock *vsk = vsock_sk(sk); - struct virtio_vsock_sock *vvs = vsk->trans; - bool space_available; - - /* buf_alloc and fwd_cnt is always included in the hdr */ - spin_lock_bh(&vvs->tx_lock); - vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); - vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); - space_available = virtio_transport_has_space(vsk); - spin_unlock_bh(&vvs->tx_lock); - return space_available; -} - /* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex * lock. */ -void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) +void virtio_transport_recv_pkt(struct virtio_transport *t, + struct virtio_vsock_pkt *pkt) { struct sockaddr_vm src, dst; struct vsock_sock *vsk; @@ -1003,7 +1108,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) le32_to_cpu(pkt->hdr.fwd_cnt)); if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) { - (void)virtio_transport_reset_no_sock(pkt); + (void)virtio_transport_reset_no_sock(t, pkt); goto free_pkt; } @@ -1014,7 +1119,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) if (!sk) { sk = vsock_find_bound_socket(&dst); if (!sk) { - (void)virtio_transport_reset_no_sock(pkt); + (void)virtio_transport_reset_no_sock(t, pkt); goto free_pkt; } } @@ -1033,7 +1138,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) switch (sk->sk_state) { case TCP_LISTEN: - virtio_transport_recv_listen(sk, pkt); + virtio_transport_recv_listen(sk, pkt, t); virtio_transport_free_pkt(pkt); break; case TCP_SYN_SENT: @@ -1051,6 +1156,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) virtio_transport_free_pkt(pkt); break; } + release_sock(sk); /* Release refcnt obtained when we fetched this socket out of the diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 8c9c4ed90fa7..4b8b1150a738 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -57,6 +57,7 @@ static bool vmci_transport_old_proto_override(bool *old_pkt_proto); static u16 vmci_transport_new_proto_supported_versions(void); static bool vmci_transport_proto_to_notify_struct(struct sock *sk, u16 *proto, bool old_pkt_proto); +static bool vmci_check_transport(struct vsock_sock *vsk); struct vmci_transport_recv_pkt_info { struct work_struct work; @@ -74,15 +75,6 @@ static u32 vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID; static int PROTOCOL_OVERRIDE = -1; -#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN 128 -#define VMCI_TRANSPORT_DEFAULT_QP_SIZE 262144 -#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX 262144 - -/* The default peer timeout indicates how long we will wait for a peer response - * to a control message. - */ -#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) - /* Helper function to convert from a VMCI error code to a VSock error code. */ static s32 vmci_transport_error_to_vsock_error(s32 vmci_error) @@ -656,7 +648,7 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg) static bool vmci_transport_stream_allow(u32 cid, u32 port) { static const u32 non_socket_contexts[] = { - VMADDR_CID_RESERVED, + VMADDR_CID_LOCAL, }; int i; @@ -1013,8 +1005,7 @@ static int vmci_transport_recv_listen(struct sock *sk, return -ECONNREFUSED; } - pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, - sk->sk_type, 0); + pending = vsock_create_connected(sk); if (!pending) { vmci_transport_send_reset(sk, pkt); return -ENOMEM; @@ -1027,14 +1018,24 @@ static int vmci_transport_recv_listen(struct sock *sk, vsock_addr_init(&vpending->remote_addr, pkt->dg.src.context, pkt->src_port); + err = vsock_assign_transport(vpending, vsock_sk(sk)); + /* Transport assigned (looking at remote_addr) must be the same + * where we received the request. + */ + if (err || !vmci_check_transport(vpending)) { + vmci_transport_send_reset(sk, pkt); + sock_put(pending); + return err; + } + /* If the proposed size fits within our min/max, accept it. Otherwise * propose our own size. */ - if (pkt->u.size >= vmci_trans(vpending)->queue_pair_min_size && - pkt->u.size <= vmci_trans(vpending)->queue_pair_max_size) { + if (pkt->u.size >= vpending->buffer_min_size && + pkt->u.size <= vpending->buffer_max_size) { qp_size = pkt->u.size; } else { - qp_size = vmci_trans(vpending)->queue_pair_size; + qp_size = vpending->buffer_size; } /* Figure out if we are using old or new requests based on the @@ -1098,12 +1099,12 @@ static int vmci_transport_recv_listen(struct sock *sk, } vsock_add_pending(sk, pending); - sk->sk_ack_backlog++; + sk_acceptq_added(sk); pending->sk_state = TCP_SYN_SENT; vmci_trans(vpending)->produce_size = vmci_trans(vpending)->consume_size = qp_size; - vmci_trans(vpending)->queue_pair_size = qp_size; + vpending->buffer_size = qp_size; vmci_trans(vpending)->notify_ops->process_request(pending); @@ -1397,8 +1398,8 @@ static int vmci_transport_recv_connecting_client_negotiate( vsk->ignore_connecting_rst = false; /* Verify that we're OK with the proposed queue pair size */ - if (pkt->u.size < vmci_trans(vsk)->queue_pair_min_size || - pkt->u.size > vmci_trans(vsk)->queue_pair_max_size) { + if (pkt->u.size < vsk->buffer_min_size || + pkt->u.size > vsk->buffer_max_size) { err = -EINVAL; goto destroy; } @@ -1503,8 +1504,7 @@ vmci_transport_recv_connecting_client_invalid(struct sock *sk, vsk->sent_request = false; vsk->ignore_connecting_rst = true; - err = vmci_transport_send_conn_request( - sk, vmci_trans(vsk)->queue_pair_size); + err = vmci_transport_send_conn_request(sk, vsk->buffer_size); if (err < 0) err = vmci_transport_error_to_vsock_error(err); else @@ -1588,21 +1588,6 @@ static int vmci_transport_socket_init(struct vsock_sock *vsk, INIT_LIST_HEAD(&vmci_trans(vsk)->elem); vmci_trans(vsk)->sk = &vsk->sk; spin_lock_init(&vmci_trans(vsk)->lock); - if (psk) { - vmci_trans(vsk)->queue_pair_size = - vmci_trans(psk)->queue_pair_size; - vmci_trans(vsk)->queue_pair_min_size = - vmci_trans(psk)->queue_pair_min_size; - vmci_trans(vsk)->queue_pair_max_size = - vmci_trans(psk)->queue_pair_max_size; - } else { - vmci_trans(vsk)->queue_pair_size = - VMCI_TRANSPORT_DEFAULT_QP_SIZE; - vmci_trans(vsk)->queue_pair_min_size = - VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN; - vmci_trans(vsk)->queue_pair_max_size = - VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX; - } return 0; } @@ -1818,8 +1803,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk) if (vmci_transport_old_proto_override(&old_pkt_proto) && old_pkt_proto) { - err = vmci_transport_send_conn_request( - sk, vmci_trans(vsk)->queue_pair_size); + err = vmci_transport_send_conn_request(sk, vsk->buffer_size); if (err < 0) { sk->sk_state = TCP_CLOSE; return err; @@ -1827,8 +1811,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk) } else { int supported_proto_versions = vmci_transport_new_proto_supported_versions(); - err = vmci_transport_send_conn_request2( - sk, vmci_trans(vsk)->queue_pair_size, + err = vmci_transport_send_conn_request2(sk, vsk->buffer_size, supported_proto_versions); if (err < 0) { sk->sk_state = TCP_CLOSE; @@ -1881,46 +1864,6 @@ static bool vmci_transport_stream_is_active(struct vsock_sock *vsk) return !vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle); } -static u64 vmci_transport_get_buffer_size(struct vsock_sock *vsk) -{ - return vmci_trans(vsk)->queue_pair_size; -} - -static u64 vmci_transport_get_min_buffer_size(struct vsock_sock *vsk) -{ - return vmci_trans(vsk)->queue_pair_min_size; -} - -static u64 vmci_transport_get_max_buffer_size(struct vsock_sock *vsk) -{ - return vmci_trans(vsk)->queue_pair_max_size; -} - -static void vmci_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) -{ - if (val < vmci_trans(vsk)->queue_pair_min_size) - vmci_trans(vsk)->queue_pair_min_size = val; - if (val > vmci_trans(vsk)->queue_pair_max_size) - vmci_trans(vsk)->queue_pair_max_size = val; - vmci_trans(vsk)->queue_pair_size = val; -} - -static void vmci_transport_set_min_buffer_size(struct vsock_sock *vsk, - u64 val) -{ - if (val > vmci_trans(vsk)->queue_pair_size) - vmci_trans(vsk)->queue_pair_size = val; - vmci_trans(vsk)->queue_pair_min_size = val; -} - -static void vmci_transport_set_max_buffer_size(struct vsock_sock *vsk, - u64 val) -{ - if (val < vmci_trans(vsk)->queue_pair_size) - vmci_trans(vsk)->queue_pair_size = val; - vmci_trans(vsk)->queue_pair_max_size = val; -} - static int vmci_transport_notify_poll_in( struct vsock_sock *vsk, size_t target, @@ -2076,7 +2019,8 @@ static u32 vmci_transport_get_local_cid(void) return vmci_get_context_id(); } -static const struct vsock_transport vmci_transport = { +static struct vsock_transport vmci_transport = { + .module = THIS_MODULE, .init = vmci_transport_socket_init, .destruct = vmci_transport_destruct, .release = vmci_transport_release, @@ -2103,15 +2047,26 @@ static const struct vsock_transport vmci_transport = { .notify_send_pre_enqueue = vmci_transport_notify_send_pre_enqueue, .notify_send_post_enqueue = vmci_transport_notify_send_post_enqueue, .shutdown = vmci_transport_shutdown, - .set_buffer_size = vmci_transport_set_buffer_size, - .set_min_buffer_size = vmci_transport_set_min_buffer_size, - .set_max_buffer_size = vmci_transport_set_max_buffer_size, - .get_buffer_size = vmci_transport_get_buffer_size, - .get_min_buffer_size = vmci_transport_get_min_buffer_size, - .get_max_buffer_size = vmci_transport_get_max_buffer_size, .get_local_cid = vmci_transport_get_local_cid, }; +static bool vmci_check_transport(struct vsock_sock *vsk) +{ + return vsk->transport == &vmci_transport; +} + +void vmci_vsock_transport_cb(bool is_host) +{ + int features; + + if (is_host) + features = VSOCK_TRANSPORT_F_H2G; + else + features = VSOCK_TRANSPORT_F_G2H; + + vsock_core_register(&vmci_transport, features); +} + static int __init vmci_transport_init(void) { int err; @@ -2128,7 +2083,6 @@ static int __init vmci_transport_init(void) pr_err("Unable to create datagram handle. (%d)\n", err); return vmci_transport_error_to_vsock_error(err); } - err = vmci_event_subscribe(VMCI_EVENT_QP_RESUMED, vmci_transport_qp_resumed_cb, NULL, &vmci_transport_qp_resumed_sub_id); @@ -2139,12 +2093,21 @@ static int __init vmci_transport_init(void) goto err_destroy_stream_handle; } - err = vsock_core_init(&vmci_transport); + /* Register only with dgram feature, other features (H2G, G2H) will be + * registered when the first host or guest becomes active. + */ + err = vsock_core_register(&vmci_transport, VSOCK_TRANSPORT_F_DGRAM); if (err < 0) goto err_unsubscribe; + err = vmci_register_vsock_callback(vmci_vsock_transport_cb); + if (err < 0) + goto err_unregister; + return 0; +err_unregister: + vsock_core_unregister(&vmci_transport); err_unsubscribe: vmci_event_unsubscribe(vmci_transport_qp_resumed_sub_id); err_destroy_stream_handle: @@ -2170,7 +2133,8 @@ static void __exit vmci_transport_exit(void) vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID; } - vsock_core_exit(); + vmci_register_vsock_callback(NULL); + vsock_core_unregister(&vmci_transport); } module_exit(vmci_transport_exit); diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h index 1ca1e8640b31..b7b072194282 100644 --- a/net/vmw_vsock/vmci_transport.h +++ b/net/vmw_vsock/vmci_transport.h @@ -108,9 +108,6 @@ struct vmci_transport { struct vmci_qp *qpair; u64 produce_size; u64 consume_size; - u64 queue_pair_size; - u64 queue_pair_min_size; - u64 queue_pair_max_size; u32 detach_sub_id; union vmci_transport_notify notify; const struct vmci_transport_notify_ops *notify_ops; diff --git a/net/vmw_vsock/vmci_transport_notify.h b/net/vmw_vsock/vmci_transport_notify.h index 7843f08d4290..a1aa5a998c0e 100644 --- a/net/vmw_vsock/vmci_transport_notify.h +++ b/net/vmw_vsock/vmci_transport_notify.h @@ -11,7 +11,6 @@ #include <linux/types.h> #include <linux/vmw_vmci_defs.h> #include <linux/vmw_vmci_api.h> -#include <linux/vm_sockets.h> #include "vmci_transport.h" diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c new file mode 100644 index 000000000000..a45f7ffca8c5 --- /dev/null +++ b/net/vmw_vsock/vsock_loopback.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* loopback transport for vsock using virtio_transport_common APIs + * + * Copyright (C) 2013-2019 Red Hat, Inc. + * Authors: Asias He <asias@redhat.com> + * Stefan Hajnoczi <stefanha@redhat.com> + * Stefano Garzarella <sgarzare@redhat.com> + * + */ +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/virtio_vsock.h> + +struct vsock_loopback { + struct workqueue_struct *workqueue; + + spinlock_t pkt_list_lock; /* protects pkt_list */ + struct list_head pkt_list; + struct work_struct pkt_work; +}; + +static struct vsock_loopback the_vsock_loopback; + +static u32 vsock_loopback_get_local_cid(void) +{ + return VMADDR_CID_LOCAL; +} + +static int vsock_loopback_send_pkt(struct virtio_vsock_pkt *pkt) +{ + struct vsock_loopback *vsock = &the_vsock_loopback; + int len = pkt->len; + + spin_lock_bh(&vsock->pkt_list_lock); + list_add_tail(&pkt->list, &vsock->pkt_list); + spin_unlock_bh(&vsock->pkt_list_lock); + + queue_work(vsock->workqueue, &vsock->pkt_work); + + return len; +} + +static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk) +{ + struct vsock_loopback *vsock = &the_vsock_loopback; + struct virtio_vsock_pkt *pkt, *n; + LIST_HEAD(freeme); + + spin_lock_bh(&vsock->pkt_list_lock); + list_for_each_entry_safe(pkt, n, &vsock->pkt_list, list) { + if (pkt->vsk != vsk) + continue; + list_move(&pkt->list, &freeme); + } + spin_unlock_bh(&vsock->pkt_list_lock); + + list_for_each_entry_safe(pkt, n, &freeme, list) { + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + + return 0; +} + +static struct virtio_transport loopback_transport = { + .transport = { + .module = THIS_MODULE, + + .get_local_cid = vsock_loopback_get_local_cid, + + .init = virtio_transport_do_socket_init, + .destruct = virtio_transport_destruct, + .release = virtio_transport_release, + .connect = virtio_transport_connect, + .shutdown = virtio_transport_shutdown, + .cancel_pkt = vsock_loopback_cancel_pkt, + + .dgram_bind = virtio_transport_dgram_bind, + .dgram_dequeue = virtio_transport_dgram_dequeue, + .dgram_enqueue = virtio_transport_dgram_enqueue, + .dgram_allow = virtio_transport_dgram_allow, + + .stream_dequeue = virtio_transport_stream_dequeue, + .stream_enqueue = virtio_transport_stream_enqueue, + .stream_has_data = virtio_transport_stream_has_data, + .stream_has_space = virtio_transport_stream_has_space, + .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, + .stream_is_active = virtio_transport_stream_is_active, + .stream_allow = virtio_transport_stream_allow, + + .notify_poll_in = virtio_transport_notify_poll_in, + .notify_poll_out = virtio_transport_notify_poll_out, + .notify_recv_init = virtio_transport_notify_recv_init, + .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, + .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, + .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, + .notify_send_init = virtio_transport_notify_send_init, + .notify_send_pre_block = virtio_transport_notify_send_pre_block, + .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, + .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, + .notify_buffer_size = virtio_transport_notify_buffer_size, + }, + + .send_pkt = vsock_loopback_send_pkt, +}; + +static void vsock_loopback_work(struct work_struct *work) +{ + struct vsock_loopback *vsock = + container_of(work, struct vsock_loopback, pkt_work); + LIST_HEAD(pkts); + + spin_lock_bh(&vsock->pkt_list_lock); + list_splice_init(&vsock->pkt_list, &pkts); + spin_unlock_bh(&vsock->pkt_list_lock); + + while (!list_empty(&pkts)) { + struct virtio_vsock_pkt *pkt; + + pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list); + list_del_init(&pkt->list); + + virtio_transport_deliver_tap_pkt(pkt); + virtio_transport_recv_pkt(&loopback_transport, pkt); + } +} + +static int __init vsock_loopback_init(void) +{ + struct vsock_loopback *vsock = &the_vsock_loopback; + int ret; + + vsock->workqueue = alloc_workqueue("vsock-loopback", 0, 0); + if (!vsock->workqueue) + return -ENOMEM; + + spin_lock_init(&vsock->pkt_list_lock); + INIT_LIST_HEAD(&vsock->pkt_list); + INIT_WORK(&vsock->pkt_work, vsock_loopback_work); + + ret = vsock_core_register(&loopback_transport.transport, + VSOCK_TRANSPORT_F_LOCAL); + if (ret) + goto out_wq; + + return 0; + +out_wq: + destroy_workqueue(vsock->workqueue); + return ret; +} + +static void __exit vsock_loopback_exit(void) +{ + struct vsock_loopback *vsock = &the_vsock_loopback; + struct virtio_vsock_pkt *pkt; + + vsock_core_unregister(&loopback_transport.transport); + + flush_work(&vsock->pkt_work); + + spin_lock_bh(&vsock->pkt_list_lock); + while (!list_empty(&vsock->pkt_list)) { + pkt = list_first_entry(&vsock->pkt_list, + struct virtio_vsock_pkt, list); + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + spin_unlock_bh(&vsock->pkt_list_lock); + + destroy_workqueue(vsock->workqueue); +} + +module_init(vsock_loopback_init); +module_exit(vsock_loopback_exit); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Stefano Garzarella <sgarzare@redhat.com>"); +MODULE_DESCRIPTION("loopback transport for vsock"); +MODULE_ALIAS_NETPROTO(PF_VSOCK); diff --git a/net/wimax/debugfs.c b/net/wimax/debugfs.c index 1af56df30276..3c54bb6b925a 100644 --- a/net/wimax/debugfs.c +++ b/net/wimax/debugfs.c @@ -13,49 +13,23 @@ #define D_SUBMODULE debugfs #include "debug-levels.h" - -#define __debugfs_register(prefix, name, parent) \ -do { \ - result = d_level_register_debugfs(prefix, name, parent); \ - if (result < 0) \ - goto error; \ -} while (0) - - -int wimax_debugfs_add(struct wimax_dev *wimax_dev) +void wimax_debugfs_add(struct wimax_dev *wimax_dev) { - int result; struct net_device *net_dev = wimax_dev->net_dev; - struct device *dev = net_dev->dev.parent; struct dentry *dentry; char buf[128]; snprintf(buf, sizeof(buf), "wimax:%s", net_dev->name); dentry = debugfs_create_dir(buf, NULL); - result = PTR_ERR(dentry); - if (IS_ERR(dentry)) { - if (result == -ENODEV) - result = 0; /* No debugfs support */ - else - dev_err(dev, "Can't create debugfs dentry: %d\n", - result); - goto out; - } wimax_dev->debugfs_dentry = dentry; - __debugfs_register("wimax_dl_", debugfs, dentry); - __debugfs_register("wimax_dl_", id_table, dentry); - __debugfs_register("wimax_dl_", op_msg, dentry); - __debugfs_register("wimax_dl_", op_reset, dentry); - __debugfs_register("wimax_dl_", op_rfkill, dentry); - __debugfs_register("wimax_dl_", op_state_get, dentry); - __debugfs_register("wimax_dl_", stack, dentry); - result = 0; -out: - return result; -error: - debugfs_remove_recursive(wimax_dev->debugfs_dentry); - return result; + d_level_register_debugfs("wimax_dl_", debugfs, dentry); + d_level_register_debugfs("wimax_dl_", id_table, dentry); + d_level_register_debugfs("wimax_dl_", op_msg, dentry); + d_level_register_debugfs("wimax_dl_", op_reset, dentry); + d_level_register_debugfs("wimax_dl_", op_rfkill, dentry); + d_level_register_debugfs("wimax_dl_", op_state_get, dentry); + d_level_register_debugfs("wimax_dl_", stack, dentry); } void wimax_debugfs_rm(struct wimax_dev *wimax_dev) diff --git a/net/wimax/stack.c b/net/wimax/stack.c index 1ba99d65feca..4b9b1c5e8f3a 100644 --- a/net/wimax/stack.c +++ b/net/wimax/stack.c @@ -481,12 +481,7 @@ int wimax_dev_add(struct wimax_dev *wimax_dev, struct net_device *net_dev) /* Set up user-space interaction */ mutex_lock(&wimax_dev->mutex); wimax_id_table_add(wimax_dev); - result = wimax_debugfs_add(wimax_dev); - if (result < 0) { - dev_err(dev, "cannot initialize debugfs: %d\n", - result); - goto error_debugfs_add; - } + wimax_debugfs_add(wimax_dev); __wimax_state_set(wimax_dev, WIMAX_ST_DOWN); mutex_unlock(&wimax_dev->mutex); @@ -498,10 +493,6 @@ int wimax_dev_add(struct wimax_dev *wimax_dev, struct net_device *net_dev) d_fnend(3, dev, "(wimax_dev %p net_dev %p) = 0\n", wimax_dev, net_dev); return 0; -error_debugfs_add: - wimax_id_table_rm(wimax_dev); - mutex_unlock(&wimax_dev->mutex); - wimax_rfkill_rm(wimax_dev); error_rfkill_add: d_fnend(3, dev, "(wimax_dev %p net_dev %p) = %d\n", wimax_dev, net_dev, result); diff --git a/net/wimax/wimax-internal.h b/net/wimax/wimax-internal.h index e819a09337ee..40751207296c 100644 --- a/net/wimax/wimax-internal.h +++ b/net/wimax/wimax-internal.h @@ -57,13 +57,10 @@ void __wimax_state_set(struct wimax_dev *wimax_dev, enum wimax_st state) void __wimax_state_change(struct wimax_dev *, enum wimax_st); #ifdef CONFIG_DEBUG_FS -int wimax_debugfs_add(struct wimax_dev *); +void wimax_debugfs_add(struct wimax_dev *); void wimax_debugfs_rm(struct wimax_dev *); #else -static inline int wimax_debugfs_add(struct wimax_dev *wimax_dev) -{ - return 0; -} +static inline void wimax_debugfs_add(struct wimax_dev *wimax_dev) {} static inline void wimax_debugfs_rm(struct wimax_dev *wimax_dev) {} #endif diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 67f8360dfcee..63cf7131f601 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -217,6 +217,8 @@ config LIB80211_CRYPT_WEP config LIB80211_CRYPT_CCMP tristate + select CRYPTO_AES + select CRYPTO_CCM config LIB80211_CRYPT_TKIP tristate diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 7dc1bbd0888f..fcac5c6366e1 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -14,6 +14,11 @@ #include "core.h" #include "rdev-ops.h" +static bool cfg80211_valid_60g_freq(u32 freq) +{ + return freq >= 58320 && freq <= 70200; +} + void cfg80211_chandef_create(struct cfg80211_chan_def *chandef, struct ieee80211_channel *chan, enum nl80211_channel_type chan_type) @@ -23,6 +28,8 @@ void cfg80211_chandef_create(struct cfg80211_chan_def *chandef, chandef->chan = chan; chandef->center_freq2 = 0; + chandef->edmg.bw_config = 0; + chandef->edmg.channels = 0; switch (chan_type) { case NL80211_CHAN_NO_HT: @@ -47,6 +54,91 @@ void cfg80211_chandef_create(struct cfg80211_chan_def *chandef, } EXPORT_SYMBOL(cfg80211_chandef_create); +static bool cfg80211_edmg_chandef_valid(const struct cfg80211_chan_def *chandef) +{ + int max_contiguous = 0; + int num_of_enabled = 0; + int contiguous = 0; + int i; + + if (!chandef->edmg.channels || !chandef->edmg.bw_config) + return false; + + if (!cfg80211_valid_60g_freq(chandef->chan->center_freq)) + return false; + + for (i = 0; i < 6; i++) { + if (chandef->edmg.channels & BIT(i)) { + contiguous++; + num_of_enabled++; + } else { + contiguous = 0; + } + + max_contiguous = max(contiguous, max_contiguous); + } + /* basic verification of edmg configuration according to + * IEEE P802.11ay/D4.0 section 9.4.2.251 + */ + /* check bw_config against contiguous edmg channels */ + switch (chandef->edmg.bw_config) { + case IEEE80211_EDMG_BW_CONFIG_4: + case IEEE80211_EDMG_BW_CONFIG_8: + case IEEE80211_EDMG_BW_CONFIG_12: + if (max_contiguous < 1) + return false; + break; + case IEEE80211_EDMG_BW_CONFIG_5: + case IEEE80211_EDMG_BW_CONFIG_9: + case IEEE80211_EDMG_BW_CONFIG_13: + if (max_contiguous < 2) + return false; + break; + case IEEE80211_EDMG_BW_CONFIG_6: + case IEEE80211_EDMG_BW_CONFIG_10: + case IEEE80211_EDMG_BW_CONFIG_14: + if (max_contiguous < 3) + return false; + break; + case IEEE80211_EDMG_BW_CONFIG_7: + case IEEE80211_EDMG_BW_CONFIG_11: + case IEEE80211_EDMG_BW_CONFIG_15: + if (max_contiguous < 4) + return false; + break; + + default: + return false; + } + + /* check bw_config against aggregated (non contiguous) edmg channels */ + switch (chandef->edmg.bw_config) { + case IEEE80211_EDMG_BW_CONFIG_4: + case IEEE80211_EDMG_BW_CONFIG_5: + case IEEE80211_EDMG_BW_CONFIG_6: + case IEEE80211_EDMG_BW_CONFIG_7: + break; + case IEEE80211_EDMG_BW_CONFIG_8: + case IEEE80211_EDMG_BW_CONFIG_9: + case IEEE80211_EDMG_BW_CONFIG_10: + case IEEE80211_EDMG_BW_CONFIG_11: + if (num_of_enabled < 2) + return false; + break; + case IEEE80211_EDMG_BW_CONFIG_12: + case IEEE80211_EDMG_BW_CONFIG_13: + case IEEE80211_EDMG_BW_CONFIG_14: + case IEEE80211_EDMG_BW_CONFIG_15: + if (num_of_enabled < 4 || max_contiguous < 2) + return false; + break; + default: + return false; + } + + return true; +} + bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) { u32 control_freq; @@ -112,6 +204,15 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) return false; } + /* channel 14 is only for IEEE 802.11b */ + if (chandef->center_freq1 == 2484 && + chandef->width != NL80211_CHAN_WIDTH_20_NOHT) + return false; + + if (cfg80211_chandef_is_edmg(chandef) && + !cfg80211_edmg_chandef_valid(chandef)) + return false; + return true; } EXPORT_SYMBOL(cfg80211_chandef_valid); @@ -721,12 +822,66 @@ static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy, return true; } +/* check if the operating channels are valid and supported */ +static bool cfg80211_edmg_usable(struct wiphy *wiphy, u8 edmg_channels, + enum ieee80211_edmg_bw_config edmg_bw_config, + int primary_channel, + struct ieee80211_edmg *edmg_cap) +{ + struct ieee80211_channel *chan; + int i, freq; + int channels_counter = 0; + + if (!edmg_channels && !edmg_bw_config) + return true; + + if ((!edmg_channels && edmg_bw_config) || + (edmg_channels && !edmg_bw_config)) + return false; + + if (!(edmg_channels & BIT(primary_channel - 1))) + return false; + + /* 60GHz channels 1..6 */ + for (i = 0; i < 6; i++) { + if (!(edmg_channels & BIT(i))) + continue; + + if (!(edmg_cap->channels & BIT(i))) + return false; + + channels_counter++; + + freq = ieee80211_channel_to_frequency(i + 1, + NL80211_BAND_60GHZ); + chan = ieee80211_get_channel(wiphy, freq); + if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) + return false; + } + + /* IEEE802.11 allows max 4 channels */ + if (channels_counter > 4) + return false; + + /* check bw_config is a subset of what driver supports + * (see IEEE P802.11ay/D4.0 section 9.4.2.251, Table 13) + */ + if ((edmg_bw_config % 4) > (edmg_cap->bw_config % 4)) + return false; + + if (edmg_bw_config > edmg_cap->bw_config) + return false; + + return true; +} + bool cfg80211_chandef_usable(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef, u32 prohibited_flags) { struct ieee80211_sta_ht_cap *ht_cap; struct ieee80211_sta_vht_cap *vht_cap; + struct ieee80211_edmg *edmg_cap; u32 width, control_freq, cap; if (WARN_ON(!cfg80211_chandef_valid(chandef))) @@ -734,6 +889,15 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, ht_cap = &wiphy->bands[chandef->chan->band]->ht_cap; vht_cap = &wiphy->bands[chandef->chan->band]->vht_cap; + edmg_cap = &wiphy->bands[chandef->chan->band]->edmg_cap; + + if (edmg_cap->channels && + !cfg80211_edmg_usable(wiphy, + chandef->edmg.channels, + chandef->edmg.bw_config, + chandef->chan->hw_value, + edmg_cap)) + return false; control_freq = chandef->chan->center_freq; @@ -894,7 +1058,8 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy, if (chan == other_chan) return true; - if (chan->band != NL80211_BAND_5GHZ) + if (chan->band != NL80211_BAND_5GHZ && + chan->band != NL80211_BAND_6GHZ) continue; r1 = cfg80211_get_unii(chan->center_freq); diff --git a/net/wireless/core.c b/net/wireless/core.c index 32b3c719fdfc..3e25229a059d 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018-2019 Intel Corporation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -142,12 +142,10 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, if (result) return result; - if (rdev->wiphy.debugfsdir && - !debugfs_rename(rdev->wiphy.debugfsdir->d_parent, - rdev->wiphy.debugfsdir, - rdev->wiphy.debugfsdir->d_parent, - newname)) - pr_err("failed to rename debugfs dir to %s!\n", newname); + if (rdev->wiphy.debugfsdir) + debugfs_rename(rdev->wiphy.debugfsdir->d_parent, + rdev->wiphy.debugfsdir, + rdev->wiphy.debugfsdir->d_parent, newname); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); @@ -302,12 +300,13 @@ static int cfg80211_rfkill_set_block(void *data, bool blocked) return 0; } -static void cfg80211_rfkill_sync_work(struct work_struct *work) +static void cfg80211_rfkill_block_work(struct work_struct *work) { struct cfg80211_registered_device *rdev; - rdev = container_of(work, struct cfg80211_registered_device, rfkill_sync); - cfg80211_rfkill_set_block(rdev, rfkill_blocked(rdev->rfkill)); + rdev = container_of(work, struct cfg80211_registered_device, + rfkill_block); + cfg80211_rfkill_set_block(rdev, true); } static void cfg80211_event_work(struct work_struct *work) @@ -518,7 +517,7 @@ use_default_name: return NULL; } - INIT_WORK(&rdev->rfkill_sync, cfg80211_rfkill_sync_work); + INIT_WORK(&rdev->rfkill_block, cfg80211_rfkill_block_work); INIT_WORK(&rdev->conn_work, cfg80211_conn_work); INIT_WORK(&rdev->event_work, cfg80211_event_work); @@ -899,11 +898,8 @@ int wiphy_register(struct wiphy *wiphy) cfg80211_rdev_list_generation++; /* add to debugfs */ - rdev->wiphy.debugfsdir = - debugfs_create_dir(wiphy_name(&rdev->wiphy), - ieee80211_debugfs_dir); - if (IS_ERR(rdev->wiphy.debugfsdir)) - rdev->wiphy.debugfsdir = NULL; + rdev->wiphy.debugfsdir = debugfs_create_dir(wiphy_name(&rdev->wiphy), + ieee80211_debugfs_dir); cfg80211_debugfs_rdev_add(rdev); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); @@ -1066,7 +1062,7 @@ void wiphy_rfkill_set_hw_state(struct wiphy *wiphy, bool blocked) struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); if (rfkill_set_hw_state(rdev->rfkill, blocked)) - schedule_work(&rdev->rfkill_sync); + schedule_work(&rdev->rfkill_block); } EXPORT_SYMBOL(wiphy_rfkill_set_hw_state); @@ -1106,6 +1102,7 @@ static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync) #ifdef CONFIG_CFG80211_WEXT kzfree(wdev->wext.keys); + wdev->wext.keys = NULL; #endif /* only initialized if we have a netdev */ if (wdev->netdev) diff --git a/net/wireless/core.h b/net/wireless/core.h index ee8388fe4a92..ed487e324571 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -28,7 +28,7 @@ struct cfg80211_registered_device { /* rfkill support */ struct rfkill_ops rfkill_ops; struct rfkill *rfkill; - struct work_struct rfkill_sync; + struct work_struct rfkill_block; /* ISO / IEC 3166 alpha2 for which this device is receiving * country IEs on, this can help disregard country IEs from APs @@ -306,6 +306,8 @@ void ieee80211_set_bitrate_flags(struct wiphy *wiphy); void cfg80211_bss_expire(struct cfg80211_registered_device *rdev); void cfg80211_bss_age(struct cfg80211_registered_device *rdev, unsigned long age_secs); +void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev, + struct ieee80211_channel *channel); /* IBSS */ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/ethtool.c b/net/wireless/ethtool.c index a9c0f368db5d..24e18405cdb4 100644 --- a/net/wireless/ethtool.c +++ b/net/wireless/ethtool.c @@ -7,9 +7,13 @@ void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct wireless_dev *wdev = dev->ieee80211_ptr; + struct device *pdev = wiphy_dev(wdev->wiphy); - strlcpy(info->driver, wiphy_dev(wdev->wiphy)->driver->name, - sizeof(info->driver)); + if (pdev->driver) + strlcpy(info->driver, pdev->driver->name, + sizeof(info->driver)); + else + strlcpy(info->driver, "N/A", sizeof(info->driver)); strlcpy(info->version, init_utsname()->release, sizeof(info->version)); diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index d1743e6abc34..ae8fe66a9bb8 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -104,13 +104,19 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, * use the mandatory rate set for 11b or * 11a for maximum compatibility. */ - struct ieee80211_supported_band *sband = - rdev->wiphy.bands[params->chandef.chan->band]; + struct ieee80211_supported_band *sband; + enum nl80211_band band; + u32 flag; int j; - u32 flag = params->chandef.chan->band == NL80211_BAND_5GHZ ? - IEEE80211_RATE_MANDATORY_A : - IEEE80211_RATE_MANDATORY_B; + band = params->chandef.chan->band; + if (band == NL80211_BAND_5GHZ || + band == NL80211_BAND_6GHZ) + flag = IEEE80211_RATE_MANDATORY_A; + else + flag = IEEE80211_RATE_MANDATORY_B; + + sband = rdev->wiphy.bands[band]; for (j = 0; j < sband->n_bitrates; j++) { if (sband->bitrates[j].flags & flag) params->basic_rates |= BIT(j); diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c index 7e8ff9d7dcfa..6a5f08f7491e 100644 --- a/net/wireless/lib80211_crypt_ccmp.c +++ b/net/wireless/lib80211_crypt_ccmp.c @@ -22,6 +22,7 @@ #include <linux/ieee80211.h> #include <linux/crypto.h> +#include <crypto/aead.h> #include <net/lib80211.h> @@ -48,20 +49,13 @@ struct lib80211_ccmp_data { int key_idx; - struct crypto_cipher *tfm; + struct crypto_aead *tfm; /* scratch buffers for virt_to_page() (crypto API) */ - u8 tx_b0[AES_BLOCK_LEN], tx_b[AES_BLOCK_LEN], - tx_e[AES_BLOCK_LEN], tx_s0[AES_BLOCK_LEN]; - u8 rx_b0[AES_BLOCK_LEN], rx_b[AES_BLOCK_LEN], rx_a[AES_BLOCK_LEN]; + u8 tx_aad[2 * AES_BLOCK_LEN]; + u8 rx_aad[2 * AES_BLOCK_LEN]; }; -static inline void lib80211_ccmp_aes_encrypt(struct crypto_cipher *tfm, - const u8 pt[16], u8 ct[16]) -{ - crypto_cipher_encrypt_one(tfm, ct, pt); -} - static void *lib80211_ccmp_init(int key_idx) { struct lib80211_ccmp_data *priv; @@ -71,7 +65,7 @@ static void *lib80211_ccmp_init(int key_idx) goto fail; priv->key_idx = key_idx; - priv->tfm = crypto_alloc_cipher("aes", 0, 0); + priv->tfm = crypto_alloc_aead("ccm(aes)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(priv->tfm)) { priv->tfm = NULL; goto fail; @@ -82,7 +76,7 @@ static void *lib80211_ccmp_init(int key_idx) fail: if (priv) { if (priv->tfm) - crypto_free_cipher(priv->tfm); + crypto_free_aead(priv->tfm); kfree(priv); } @@ -93,25 +87,16 @@ static void lib80211_ccmp_deinit(void *priv) { struct lib80211_ccmp_data *_priv = priv; if (_priv && _priv->tfm) - crypto_free_cipher(_priv->tfm); + crypto_free_aead(_priv->tfm); kfree(priv); } -static inline void xor_block(u8 * b, u8 * a, size_t len) -{ - int i; - for (i = 0; i < len; i++) - b[i] ^= a[i]; -} - -static void ccmp_init_blocks(struct crypto_cipher *tfm, - struct ieee80211_hdr *hdr, - u8 * pn, size_t dlen, u8 * b0, u8 * auth, u8 * s0) +static int ccmp_init_iv_and_aad(const struct ieee80211_hdr *hdr, + const u8 *pn, u8 *iv, u8 *aad) { u8 *pos, qc = 0; size_t aad_len; int a4_included, qc_included; - u8 aad[2 * AES_BLOCK_LEN]; a4_included = ieee80211_has_a4(hdr->frame_control); qc_included = ieee80211_is_data_qos(hdr->frame_control); @@ -127,17 +112,19 @@ static void ccmp_init_blocks(struct crypto_cipher *tfm, aad_len += 2; } - /* CCM Initial Block: - * Flag (Include authentication header, M=3 (8-octet MIC), - * L=1 (2-octet Dlen)) - * Nonce: 0x00 | A2 | PN - * Dlen */ - b0[0] = 0x59; - b0[1] = qc; - memcpy(b0 + 2, hdr->addr2, ETH_ALEN); - memcpy(b0 + 8, pn, CCMP_PN_LEN); - b0[14] = (dlen >> 8) & 0xff; - b0[15] = dlen & 0xff; + /* In CCM, the initial vectors (IV) used for CTR mode encryption and CBC + * mode authentication are not allowed to collide, yet both are derived + * from the same vector. We only set L := 1 here to indicate that the + * data size can be represented in (L+1) bytes. The CCM layer will take + * care of storing the data length in the top (L+1) bytes and setting + * and clearing the other bits as is required to derive the two IVs. + */ + iv[0] = 0x1; + + /* Nonce: QC | A2 | PN */ + iv[1] = qc; + memcpy(iv + 2, hdr->addr2, ETH_ALEN); + memcpy(iv + 8, pn, CCMP_PN_LEN); /* AAD: * FC with bits 4..6 and 11..13 masked to zero; 14 is always one @@ -147,31 +134,20 @@ static void ccmp_init_blocks(struct crypto_cipher *tfm, * QC (if present) */ pos = (u8 *) hdr; - aad[0] = 0; /* aad_len >> 8 */ - aad[1] = aad_len & 0xff; - aad[2] = pos[0] & 0x8f; - aad[3] = pos[1] & 0xc7; - memcpy(aad + 4, hdr->addr1, 3 * ETH_ALEN); + aad[0] = pos[0] & 0x8f; + aad[1] = pos[1] & 0xc7; + memcpy(aad + 2, hdr->addr1, 3 * ETH_ALEN); pos = (u8 *) & hdr->seq_ctrl; - aad[22] = pos[0] & 0x0f; - aad[23] = 0; /* all bits masked */ - memset(aad + 24, 0, 8); + aad[20] = pos[0] & 0x0f; + aad[21] = 0; /* all bits masked */ + memset(aad + 22, 0, 8); if (a4_included) - memcpy(aad + 24, hdr->addr4, ETH_ALEN); + memcpy(aad + 22, hdr->addr4, ETH_ALEN); if (qc_included) { - aad[a4_included ? 30 : 24] = qc; + aad[a4_included ? 28 : 22] = qc; /* rest of QC masked */ } - - /* Start with the first block and AAD */ - lib80211_ccmp_aes_encrypt(tfm, b0, auth); - xor_block(auth, aad, AES_BLOCK_LEN); - lib80211_ccmp_aes_encrypt(tfm, auth, auth); - xor_block(auth, &aad[AES_BLOCK_LEN], AES_BLOCK_LEN); - lib80211_ccmp_aes_encrypt(tfm, auth, auth); - b0[0] &= 0x07; - b0[14] = b0[15] = 0; - lib80211_ccmp_aes_encrypt(tfm, b0, s0); + return aad_len; } static int lib80211_ccmp_hdr(struct sk_buff *skb, int hdr_len, @@ -214,13 +190,13 @@ static int lib80211_ccmp_hdr(struct sk_buff *skb, int hdr_len, static int lib80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv) { struct lib80211_ccmp_data *key = priv; - int data_len, i, blocks, last, len; - u8 *pos, *mic; struct ieee80211_hdr *hdr; - u8 *b0 = key->tx_b0; - u8 *b = key->tx_b; - u8 *e = key->tx_e; - u8 *s0 = key->tx_s0; + struct aead_request *req; + struct scatterlist sg[2]; + u8 *aad = key->tx_aad; + u8 iv[AES_BLOCK_LEN]; + int len, data_len, aad_len; + int ret; if (skb_tailroom(skb) < CCMP_MIC_LEN || skb->len < hdr_len) return -1; @@ -230,31 +206,28 @@ static int lib80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv) if (len < 0) return -1; - pos = skb->data + hdr_len + CCMP_HDR_LEN; + req = aead_request_alloc(key->tfm, GFP_ATOMIC); + if (!req) + return -ENOMEM; + hdr = (struct ieee80211_hdr *)skb->data; - ccmp_init_blocks(key->tfm, hdr, key->tx_pn, data_len, b0, b, s0); - - blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN); - last = data_len % AES_BLOCK_LEN; - - for (i = 1; i <= blocks; i++) { - len = (i == blocks && last) ? last : AES_BLOCK_LEN; - /* Authentication */ - xor_block(b, pos, len); - lib80211_ccmp_aes_encrypt(key->tfm, b, b); - /* Encryption, with counter */ - b0[14] = (i >> 8) & 0xff; - b0[15] = i & 0xff; - lib80211_ccmp_aes_encrypt(key->tfm, b0, e); - xor_block(pos, e, len); - pos += len; - } + aad_len = ccmp_init_iv_and_aad(hdr, key->tx_pn, iv, aad); - mic = skb_put(skb, CCMP_MIC_LEN); - for (i = 0; i < CCMP_MIC_LEN; i++) - mic[i] = b[i] ^ s0[i]; + skb_put(skb, CCMP_MIC_LEN); - return 0; + sg_init_table(sg, 2); + sg_set_buf(&sg[0], aad, aad_len); + sg_set_buf(&sg[1], skb->data + hdr_len + CCMP_HDR_LEN, + data_len + CCMP_MIC_LEN); + + aead_request_set_callback(req, 0, NULL, NULL); + aead_request_set_ad(req, aad_len); + aead_request_set_crypt(req, sg, sg, data_len, iv); + + ret = crypto_aead_encrypt(req); + aead_request_free(req); + + return ret; } /* @@ -283,13 +256,13 @@ static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv) struct lib80211_ccmp_data *key = priv; u8 keyidx, *pos; struct ieee80211_hdr *hdr; - u8 *b0 = key->rx_b0; - u8 *b = key->rx_b; - u8 *a = key->rx_a; + struct aead_request *req; + struct scatterlist sg[2]; + u8 *aad = key->rx_aad; + u8 iv[AES_BLOCK_LEN]; u8 pn[6]; - int i, blocks, last, len; - size_t data_len = skb->len - hdr_len - CCMP_HDR_LEN - CCMP_MIC_LEN; - u8 *mic = skb->data + skb->len - CCMP_MIC_LEN; + int aad_len, ret; + size_t data_len = skb->len - hdr_len - CCMP_HDR_LEN; if (skb->len < hdr_len + CCMP_HDR_LEN + CCMP_MIC_LEN) { key->dot11RSNAStatsCCMPFormatErrors++; @@ -337,28 +310,26 @@ static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv) return -4; } - ccmp_init_blocks(key->tfm, hdr, pn, data_len, b0, a, b); - xor_block(mic, b, CCMP_MIC_LEN); - - blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN); - last = data_len % AES_BLOCK_LEN; - - for (i = 1; i <= blocks; i++) { - len = (i == blocks && last) ? last : AES_BLOCK_LEN; - /* Decrypt, with counter */ - b0[14] = (i >> 8) & 0xff; - b0[15] = i & 0xff; - lib80211_ccmp_aes_encrypt(key->tfm, b0, b); - xor_block(pos, b, len); - /* Authentication */ - xor_block(a, pos, len); - lib80211_ccmp_aes_encrypt(key->tfm, a, a); - pos += len; - } + req = aead_request_alloc(key->tfm, GFP_ATOMIC); + if (!req) + return -ENOMEM; - if (memcmp(mic, a, CCMP_MIC_LEN) != 0) { - net_dbg_ratelimited("CCMP: decrypt failed: STA=%pM\n", - hdr->addr2); + aad_len = ccmp_init_iv_and_aad(hdr, pn, iv, aad); + + sg_init_table(sg, 2); + sg_set_buf(&sg[0], aad, aad_len); + sg_set_buf(&sg[1], pos, data_len); + + aead_request_set_callback(req, 0, NULL, NULL); + aead_request_set_ad(req, aad_len); + aead_request_set_crypt(req, sg, sg, data_len, iv); + + ret = crypto_aead_decrypt(req); + aead_request_free(req); + + if (ret) { + net_dbg_ratelimited("CCMP: decrypt failed: STA=%pM (%d)\n", + hdr->addr2, ret); key->dot11RSNAStatsCCMPDecryptErrors++; return -5; } @@ -377,7 +348,7 @@ static int lib80211_ccmp_set_key(void *key, int len, u8 * seq, void *priv) { struct lib80211_ccmp_data *data = priv; int keyidx; - struct crypto_cipher *tfm = data->tfm; + struct crypto_aead *tfm = data->tfm; keyidx = data->key_idx; memset(data, 0, sizeof(*data)); @@ -394,7 +365,9 @@ static int lib80211_ccmp_set_key(void *key, int len, u8 * seq, void *priv) data->rx_pn[4] = seq[1]; data->rx_pn[5] = seq[0]; } - crypto_cipher_setkey(data->tfm, data->key, CCMP_TK_LEN); + if (crypto_aead_setauthsize(data->tfm, CCMP_MIC_LEN) || + crypto_aead_setkey(data->tfm, data->key, CCMP_TK_LEN)) + return -1; } else if (len == 0) data->key_set = 0; else diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fd05ae1437a9..cedf17d4933f 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -201,6 +201,38 @@ cfg80211_get_dev_from_info(struct net *netns, struct genl_info *info) return __cfg80211_rdev_from_attrs(netns, info->attrs); } +static int validate_beacon_head(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u8 *data = nla_data(attr); + unsigned int len = nla_len(attr); + const struct element *elem; + const struct ieee80211_mgmt *mgmt = (void *)data; + unsigned int fixedlen = offsetof(struct ieee80211_mgmt, + u.beacon.variable); + + if (len < fixedlen) + goto err; + + if (ieee80211_hdrlen(mgmt->frame_control) != + offsetof(struct ieee80211_mgmt, u.beacon)) + goto err; + + data += fixedlen; + len -= fixedlen; + + for_each_element(elem, data, len) { + /* nothing */ + } + + if (for_each_element_completed(elem, data, len)) + return 0; + +err: + NL_SET_ERR_MSG_ATTR(extack, attr, "malformed beacon head"); + return -EINVAL; +} + static int validate_ie_attr(const struct nlattr *attr, struct netlink_ext_ack *extack) { @@ -281,7 +313,16 @@ nl80211_pmsr_attr_policy[NL80211_PMSR_ATTR_MAX + 1] = { NLA_POLICY_NESTED_ARRAY(nl80211_psmr_peer_attr_policy), }; +static const struct nla_policy +he_obss_pd_policy[NL80211_HE_OBSS_PD_ATTR_MAX + 1] = { + [NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET] = + NLA_POLICY_RANGE(NLA_U8, 1, 20), + [NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET] = + NLA_POLICY_RANGE(NLA_U8, 1, 20), +}; + const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { + [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING, .len = 20-1 }, @@ -289,6 +330,13 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WIPHY_FREQ] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_CHANNEL_TYPE] = { .type = NLA_U32 }, + [NL80211_ATTR_WIPHY_EDMG_CHANNELS] = NLA_POLICY_RANGE(NLA_U8, + NL80211_EDMG_CHANNELS_MIN, + NL80211_EDMG_CHANNELS_MAX), + [NL80211_ATTR_WIPHY_EDMG_BW_CONFIG] = NLA_POLICY_RANGE(NLA_U8, + NL80211_EDMG_BW_CONFIG_MIN, + NL80211_EDMG_BW_CONFIG_MAX), + [NL80211_ATTR_CHANNEL_WIDTH] = { .type = NLA_U32 }, [NL80211_ATTR_CENTER_FREQ1] = { .type = NLA_U32 }, [NL80211_ATTR_CENTER_FREQ2] = { .type = NLA_U32 }, @@ -322,8 +370,9 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 }, [NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 }, - [NL80211_ATTR_BEACON_HEAD] = { .type = NLA_BINARY, - .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_BEACON_HEAD] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_beacon_head, + IEEE80211_MAX_DATA_LEN), [NL80211_ATTR_BEACON_TAIL] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, IEEE80211_MAX_DATA_LEN), @@ -344,7 +393,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MNTR_FLAGS] = { /* NLA_NESTED can't be empty */ }, [NL80211_ATTR_MESH_ID] = { .type = NLA_BINARY, .len = IEEE80211_MAX_MESH_ID_LEN }, - [NL80211_ATTR_MPATH_NEXT_HOP] = { .type = NLA_U32 }, + [NL80211_ATTR_MPATH_NEXT_HOP] = NLA_POLICY_ETH_ADDR_COMPAT, [NL80211_ATTR_REG_ALPHA2] = { .type = NLA_STRING, .len = 2 }, [NL80211_ATTR_REG_RULES] = { .type = NLA_NESTED }, @@ -388,6 +437,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG }, [NL80211_ATTR_CONTROL_PORT_OVER_NL80211] = { .type = NLA_FLAG }, [NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG }, + [NL80211_ATTR_STATUS_CODE] = { .type = NLA_U16 }, [NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 }, [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 }, [NL80211_ATTR_PID] = { .type = NLA_U32 }, @@ -574,6 +624,8 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_SAE_PASSWORD] = { .type = NLA_BINARY, .len = SAE_PASSWORD_MAX_LEN }, [NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG }, + [NL80211_ATTR_HE_OBSS_PD] = NLA_POLICY_NESTED(he_obss_pd_policy), + [NL80211_ATTR_VLAN_ID] = NLA_POLICY_RANGE(NLA_U16, 1, VLAN_N_VID - 2), }; /* policy for the key attributes */ @@ -667,6 +719,7 @@ static const struct nla_policy nl80211_match_band_rssi_policy[NUM_NL80211_BANDS] = { [NL80211_BAND_2GHZ] = { .type = NLA_S32 }, [NL80211_BAND_5GHZ] = { .type = NLA_S32 }, + [NL80211_BAND_6GHZ] = { .type = NLA_S32 }, [NL80211_BAND_60GHZ] = { .type = NLA_S32 }, }; @@ -749,17 +802,25 @@ int nl80211_prepare_wdev_dump(struct netlink_callback *cb, int err; if (!cb->args[0]) { + struct nlattr **attrbuf; + + attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf), + GFP_KERNEL); + if (!attrbuf) + return -ENOMEM; + err = nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, - genl_family_attrbuf(&nl80211_fam), - nl80211_fam.maxattr, + attrbuf, nl80211_fam.maxattr, nl80211_policy, NULL); - if (err) + if (err) { + kfree(attrbuf); return err; + } - *wdev = __cfg80211_wdev_from_attrs( - sock_net(cb->skb->sk), - genl_family_attrbuf(&nl80211_fam)); + *wdev = __cfg80211_wdev_from_attrs(sock_net(cb->skb->sk), + attrbuf); + kfree(attrbuf); if (IS_ERR(*wdev)) return PTR_ERR(*wdev); *rdev = wiphy_to_rdev((*wdev)->wiphy); @@ -1555,6 +1616,15 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg, nla_nest_end(msg, nl_iftype_data); } + /* add EDMG info */ + if (sband->edmg_cap.channels && + (nla_put_u8(msg, NL80211_BAND_ATTR_EDMG_CHANNELS, + sband->edmg_cap.channels) || + nla_put_u8(msg, NL80211_BAND_ATTR_EDMG_BW_CONFIG, + sband->edmg_cap.bw_config))) + + return -ENOBUFS; + /* add bitrates */ nl_rates = nla_nest_start_noflag(msg, NL80211_BAND_ATTR_RATES); if (!nl_rates) @@ -2065,6 +2135,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, CMD(add_tx_ts, ADD_TX_TS); CMD(set_multicast_to_unicast, SET_MULTICAST_TO_UNICAST); CMD(update_connect_params, UPDATE_CONNECT_PARAMS); + CMD(update_ft_ies, UPDATE_FT_IES); } #undef CMD @@ -2172,6 +2243,30 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, rdev->wiphy.vht_capa_mod_mask)) goto nla_put_failure; + if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, + rdev->wiphy.perm_addr)) + goto nla_put_failure; + + if (!is_zero_ether_addr(rdev->wiphy.addr_mask) && + nla_put(msg, NL80211_ATTR_MAC_MASK, ETH_ALEN, + rdev->wiphy.addr_mask)) + goto nla_put_failure; + + if (rdev->wiphy.n_addresses > 1) { + void *attr; + + attr = nla_nest_start(msg, NL80211_ATTR_MAC_ADDRS); + if (!attr) + goto nla_put_failure; + + for (i = 0; i < rdev->wiphy.n_addresses; i++) + if (nla_put(msg, i + 1, ETH_ALEN, + rdev->wiphy.addresses[i].addr)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + } + state->split_start++; break; case 10: @@ -2366,14 +2461,21 @@ static int nl80211_dump_wiphy_parse(struct sk_buff *skb, struct netlink_callback *cb, struct nl80211_dump_wiphy_state *state) { - struct nlattr **tb = genl_family_attrbuf(&nl80211_fam); - int ret = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + nl80211_fam.hdrsize, - tb, nl80211_fam.maxattr, - nl80211_policy, NULL); + struct nlattr **tb = kcalloc(NUM_NL80211_ATTR, sizeof(*tb), GFP_KERNEL); + int ret; + + if (!tb) + return -ENOMEM; + + ret = nlmsg_parse_deprecated(cb->nlh, + GENL_HDRLEN + nl80211_fam.hdrsize, + tb, nl80211_fam.maxattr, + nl80211_policy, NULL); /* ignore parse errors for backward compatibility */ - if (ret) - return 0; + if (ret) { + ret = 0; + goto out; + } state->split = tb[NL80211_ATTR_SPLIT_WIPHY_DUMP]; if (tb[NL80211_ATTR_WIPHY]) @@ -2386,8 +2488,10 @@ static int nl80211_dump_wiphy_parse(struct sk_buff *skb, int ifidx = nla_get_u32(tb[NL80211_ATTR_IFINDEX]); netdev = __dev_get_by_index(sock_net(skb->sk), ifidx); - if (!netdev) - return -ENODEV; + if (!netdev) { + ret = -ENODEV; + goto out; + } if (netdev->ieee80211_ptr) { rdev = wiphy_to_rdev( netdev->ieee80211_ptr->wiphy); @@ -2395,7 +2499,10 @@ static int nl80211_dump_wiphy_parse(struct sk_buff *skb, } } - return 0; + ret = 0; +out: + kfree(tb); + return ret; } static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) @@ -2564,6 +2671,8 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, control_freq = nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ]); + memset(chandef, 0, sizeof(*chandef)); + chandef->chan = ieee80211_get_channel(&rdev->wiphy, control_freq); chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = control_freq; @@ -2622,6 +2731,18 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]); } + if (info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]) { + chandef->edmg.channels = + nla_get_u8(info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]); + + if (info->attrs[NL80211_ATTR_WIPHY_EDMG_BW_CONFIG]) + chandef->edmg.bw_config = + nla_get_u8(info->attrs[NL80211_ATTR_WIPHY_EDMG_BW_CONFIG]); + } else { + chandef->edmg.bw_config = 0; + chandef->edmg.channels = 0; + } + if (!cfg80211_chandef_valid(chandef)) { NL_SET_ERR_MSG(extack, "invalid channel definition"); return -EINVAL; @@ -3092,7 +3213,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag if (rdev->ops->get_channel) { int ret; - struct cfg80211_chan_def chandef; + struct cfg80211_chan_def chandef = {}; ret = rdev_get_channel(rdev, wdev, &chandef); if (ret == 0) { @@ -3821,6 +3942,10 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) key.type != NL80211_KEYTYPE_GROUP) return -EINVAL; + if (key.type == NL80211_KEYTYPE_GROUP && + info->attrs[NL80211_ATTR_VLAN_ID]) + key.p.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]); + if (!rdev->ops->add_key) return -EOPNOTSUPP; @@ -4359,6 +4484,34 @@ static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev, return 0; } +static int nl80211_parse_he_obss_pd(struct nlattr *attrs, + struct ieee80211_he_obss_pd *he_obss_pd) +{ + struct nlattr *tb[NL80211_HE_OBSS_PD_ATTR_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NL80211_HE_OBSS_PD_ATTR_MAX, attrs, + he_obss_pd_policy, NULL); + if (err) + return err; + + if (!tb[NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET] || + !tb[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET]) + return -EINVAL; + + he_obss_pd->min_offset = + nla_get_u32(tb[NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET]); + he_obss_pd->max_offset = + nla_get_u32(tb[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET]); + + if (he_obss_pd->min_offset >= he_obss_pd->max_offset) + return -EINVAL; + + he_obss_pd->enable = true; + + return 0; +} + static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, const u8 *rates) { @@ -4643,6 +4796,14 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) params.twt_responder = nla_get_flag(info->attrs[NL80211_ATTR_TWT_RESPONDER]); + if (info->attrs[NL80211_ATTR_HE_OBSS_PD]) { + err = nl80211_parse_he_obss_pd( + info->attrs[NL80211_ATTR_HE_OBSS_PD], + ¶ms.he_obss_pd); + if (err) + return err; + } + nl80211_calculate_ap_params(¶ms); if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT]) @@ -4941,6 +5102,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(CONNECTED_TIME, connected_time, u32); PUT_SINFO(INACTIVE_TIME, inactive_time, u32); + PUT_SINFO_U64(ASSOC_AT_BOOTTIME, assoc_at); if (sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES) | BIT_ULL(NL80211_STA_INFO_RX_BYTES64)) && @@ -5555,6 +5717,9 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_STA_AID]) params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]); + if (info->attrs[NL80211_ATTR_VLAN_ID]) + params.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]); + if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]) params.listen_interval = nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); @@ -5700,6 +5865,9 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.listen_interval = nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); + if (info->attrs[NL80211_ATTR_VLAN_ID]) + params.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]); + if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) { params.support_p2p_ps = nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]); @@ -6149,6 +6317,9 @@ static int nl80211_del_mpath(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->del_mpath) return -EOPNOTSUPP; + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; + return rdev_del_mpath(rdev, dev, dst); } @@ -8106,10 +8277,8 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, /* leave request id zero for legacy request * or if driver does not support multi-scheduled scan */ - if (want_multi && rdev->wiphy.max_sched_scan_reqs > 1) { - while (!sched_scan_req->reqid) - sched_scan_req->reqid = cfg80211_assign_cookie(rdev); - } + if (want_multi && rdev->wiphy.max_sched_scan_reqs > 1) + sched_scan_req->reqid = cfg80211_assign_cookie(rdev); err = rdev_sched_scan_start(rdev, dev, sched_scan_req); if (err) @@ -8685,6 +8854,10 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq, nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_SCAN, survey->time_scan, NL80211_SURVEY_INFO_PAD)) goto nla_put_failure; + if ((survey->filled & SURVEY_INFO_TIME_BSS_RX) && + nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_BSS_RX, + survey->time_bss_rx, NL80211_SURVEY_INFO_PAD)) + goto nla_put_failure; nla_nest_end(msg, infoattr); @@ -8698,7 +8871,7 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq, static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) { - struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam); + struct nlattr **attrbuf; struct survey_info survey; struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; @@ -8706,6 +8879,10 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) int res; bool radio_stats; + attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf), GFP_KERNEL); + if (!attrbuf) + return -ENOMEM; + rtnl_lock(); res = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); if (res) @@ -8750,6 +8927,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) cb->args[2] = survey_idx; res = skb->len; out_err: + kfree(attrbuf); rtnl_unlock(); return res; } @@ -9609,6 +9787,7 @@ static int nl80211_testmode_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct cfg80211_registered_device *rdev; + struct nlattr **attrbuf = NULL; int err; long phy_idx; void *data = NULL; @@ -9629,7 +9808,12 @@ static int nl80211_testmode_dump(struct sk_buff *skb, goto out_err; } } else { - struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam); + attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf), + GFP_KERNEL); + if (!attrbuf) { + err = -ENOMEM; + goto out_err; + } err = nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, @@ -9696,6 +9880,7 @@ static int nl80211_testmode_dump(struct sk_buff *skb, /* see above */ cb->args[0] = phy_idx + 1; out_err: + kfree(attrbuf); rtnl_unlock(); return err; } @@ -9790,6 +9975,15 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } + if (info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]) { + connect.edmg.channels = + nla_get_u8(info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]); + + if (info->attrs[NL80211_ATTR_WIPHY_EDMG_BW_CONFIG]) + connect.edmg.bw_config = + nla_get_u8(info->attrs[NL80211_ATTR_WIPHY_EDMG_BW_CONFIG]); + } + if (connect.privacy && info->attrs[NL80211_ATTR_KEYS]) { connkeys = nl80211_parse_connkeys(rdev, info, NULL); if (IS_ERR(connkeys)) @@ -10650,6 +10844,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, if (err) return err; + cfg80211_sinfo_release_content(&sinfo); if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG)) wdev->cqm_config->last_rssi_event_value = (s8) sinfo.rx_beacon_signal_avg; @@ -10659,9 +10854,11 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, hyst = wdev->cqm_config->rssi_hyst; n = wdev->cqm_config->n_rssi_thresholds; - for (i = 0; i < n; i++) + for (i = 0; i < n; i++) { + i = array_index_nospec(i, n); if (last < wdev->cqm_config->rssi_thresholds[i]) break; + } low_index = i - 1; if (low_index >= 0) { @@ -12705,8 +12902,7 @@ static int nl80211_vendor_check_policy(const struct wiphy_vendor_command *vcmd, return -EINVAL; } - return nl80211_validate_nested(attr, vcmd->maxattr, vcmd->policy, - extack); + return nla_validate_nested(attr, vcmd->maxattr, vcmd->policy, extack); } static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info) @@ -12789,7 +12985,7 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, struct cfg80211_registered_device **rdev, struct wireless_dev **wdev) { - struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam); + struct nlattr **attrbuf; u32 vid, subcmd; unsigned int i; int vcmd_idx = -1; @@ -12820,24 +13016,32 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, return 0; } + attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf), GFP_KERNEL); + if (!attrbuf) + return -ENOMEM; + err = nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, attrbuf, nl80211_fam.maxattr, nl80211_policy, NULL); if (err) - return err; + goto out; if (!attrbuf[NL80211_ATTR_VENDOR_ID] || - !attrbuf[NL80211_ATTR_VENDOR_SUBCMD]) - return -EINVAL; + !attrbuf[NL80211_ATTR_VENDOR_SUBCMD]) { + err = -EINVAL; + goto out; + } *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk), attrbuf); if (IS_ERR(*wdev)) *wdev = NULL; *rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk), attrbuf); - if (IS_ERR(*rdev)) - return PTR_ERR(*rdev); + if (IS_ERR(*rdev)) { + err = PTR_ERR(*rdev); + goto out; + } vid = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_ID]); subcmd = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_SUBCMD]); @@ -12850,15 +13054,19 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, if (vcmd->info.vendor_id != vid || vcmd->info.subcmd != subcmd) continue; - if (!vcmd->dumpit) - return -EOPNOTSUPP; + if (!vcmd->dumpit) { + err = -EOPNOTSUPP; + goto out; + } vcmd_idx = i; break; } - if (vcmd_idx < 0) - return -EOPNOTSUPP; + if (vcmd_idx < 0) { + err = -EOPNOTSUPP; + goto out; + } if (attrbuf[NL80211_ATTR_VENDOR_DATA]) { data = nla_data(attrbuf[NL80211_ATTR_VENDOR_DATA]); @@ -12869,7 +13077,7 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, attrbuf[NL80211_ATTR_VENDOR_DATA], cb->extack); if (err) - return err; + goto out; } /* 0 is the first index - add 1 to parse only once */ @@ -12881,7 +13089,10 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, cb->args[4] = data_len; /* keep rtnl locked in successful case */ - return 0; + err = 0; +out: + kfree(attrbuf); + return err; } static int nl80211_vendor_cmd_dump(struct sk_buff *skb, @@ -13481,7 +13692,7 @@ static int nl80211_get_ftm_responder_stats(struct sk_buff *skb, hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0, NL80211_CMD_GET_FTM_RESPONDER_STATS); if (!hdr) - return -ENOBUFS; + goto nla_put_failure; if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; @@ -13586,6 +13797,8 @@ static int nl80211_probe_mesh_link(struct sk_buff *skb, struct genl_info *info) if (err) return err; + cfg80211_sinfo_release_content(&sinfo); + return rdev_probe_mesh_link(rdev, dev, dest, buf, len); } @@ -14559,6 +14772,7 @@ static struct genl_family nl80211_fam __ro_after_init = { .n_ops = ARRAY_SIZE(nl80211_ops), .mcgrps = nl80211_mcgrps, .n_mcgrps = ARRAY_SIZE(nl80211_mcgrps), + .parallel_ops = true, }; /* notification functions */ @@ -14835,12 +15049,10 @@ void nl80211_common_reg_change_event(enum nl80211_commands cmd_id, return; hdr = nl80211hdr_put(msg, 0, 0, 0, cmd_id); - if (!hdr) { - nlmsg_free(msg); - return; - } + if (!hdr) + goto nla_put_failure; - if (nl80211_reg_change_event_fill(msg, request) == false) + if (!nl80211_reg_change_event_fill(msg, request)) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -16090,7 +16302,9 @@ void cfg80211_ch_switch_notify(struct net_device *dev, if (wdev->iftype == NL80211_IFTYPE_STATION && !WARN_ON(!wdev->current_bss)) - wdev->current_bss->pub.channel = chandef->chan; + cfg80211_update_assoc_bss_entry(wdev, chandef->chan); + + cfg80211_sched_dfs_chan_update(rdev); nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL, NL80211_CMD_CH_SWITCH_NOTIFY, 0); diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index e853a4fe6f97..e0d34f796d0b 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -538,6 +538,10 @@ static inline int rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, u32 changed) { int ret; + + if (!rdev->ops->set_wiphy_params) + return -EOPNOTSUPP; + trace_rdev_set_wiphy_params(&rdev->wiphy, changed); ret = rdev->ops->set_wiphy_params(&rdev->wiphy, changed); trace_rdev_return_int(&rdev->wiphy, ret); @@ -1167,6 +1171,16 @@ rdev_start_radar_detection(struct cfg80211_registered_device *rdev, return ret; } +static inline void +rdev_end_cac(struct cfg80211_registered_device *rdev, + struct net_device *dev) +{ + trace_rdev_end_cac(&rdev->wiphy, dev); + if (rdev->ops->end_cac) + rdev->ops->end_cac(&rdev->wiphy, dev); + trace_rdev_return_void(&rdev->wiphy); +} + static inline int rdev_set_mcast_rate(struct cfg80211_registered_device *rdev, struct net_device *dev, diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 4831ad745f91..fff9a74891fc 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2108,7 +2108,7 @@ static void reg_call_notifier(struct wiphy *wiphy, static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev) { - struct cfg80211_chan_def chandef; + struct cfg80211_chan_def chandef = {}; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); enum nl80211_iftype iftype; @@ -2261,14 +2261,15 @@ static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator) static void handle_channel_custom(struct wiphy *wiphy, struct ieee80211_channel *chan, - const struct ieee80211_regdomain *regd) + const struct ieee80211_regdomain *regd, + u32 min_bw) { u32 bw_flags = 0; const struct ieee80211_reg_rule *reg_rule = NULL; const struct ieee80211_power_rule *power_rule = NULL; u32 bw; - for (bw = MHZ_TO_KHZ(20); bw >= MHZ_TO_KHZ(5); bw = bw / 2) { + for (bw = MHZ_TO_KHZ(20); bw >= min_bw; bw = bw / 2) { reg_rule = freq_reg_info_regd(MHZ_TO_KHZ(chan->center_freq), regd, bw); if (!IS_ERR(reg_rule)) @@ -2324,8 +2325,14 @@ static void handle_band_custom(struct wiphy *wiphy, if (!sband) return; + /* + * We currently assume that you always want at least 20 MHz, + * otherwise channel 12 might get enabled if this rule is + * compatible to US, which permits 2402 - 2472 MHz. + */ for (i = 0; i < sband->n_channels; i++) - handle_channel_custom(wiphy, &sband->channels[i], regd); + handle_channel_custom(wiphy, &sband->channels[i], regd, + MHZ_TO_KHZ(20)); } /* Used by drivers prior to wiphy registration */ @@ -2788,7 +2795,7 @@ static void reg_process_pending_hints(void) /* When last_request->processed becomes true this will be rescheduled */ if (lr && !lr->processed) { - reg_process_hint(lr); + pr_debug("Pending regulatory request, waiting for it to be processed...\n"); return; } @@ -3806,8 +3813,9 @@ void wiphy_regulatory_deregister(struct wiphy *wiphy) } /* - * See http://www.fcc.gov/document/5-ghz-unlicensed-spectrum-unii, for - * UNII band definitions + * See FCC notices for UNII band definitions + * 5GHz: https://www.fcc.gov/document/5-ghz-unlicensed-spectrum-unii + * 6GHz: https://www.fcc.gov/document/fcc-proposes-more-spectrum-unlicensed-use-0 */ int cfg80211_get_unii(int freq) { @@ -3831,6 +3839,22 @@ int cfg80211_get_unii(int freq) if (freq > 5725 && freq <= 5825) return 4; + /* UNII-5 */ + if (freq > 5925 && freq <= 6425) + return 5; + + /* UNII-6 */ + if (freq > 6425 && freq <= 6525) + return 6; + + /* UNII-7 */ + if (freq > 6525 && freq <= 6875) + return 7; + + /* UNII-8 */ + if (freq > 6875 && freq <= 7125) + return 8; + return -EINVAL; } @@ -3866,6 +3890,26 @@ bool regulatory_pre_cac_allowed(struct wiphy *wiphy) return pre_cac_allowed; } +EXPORT_SYMBOL(regulatory_pre_cac_allowed); + +static void cfg80211_check_and_end_cac(struct cfg80211_registered_device *rdev) +{ + struct wireless_dev *wdev; + /* If we finished CAC or received radar, we should end any + * CAC running on the same channels. + * the check !cfg80211_chandef_dfs_usable contain 2 options: + * either all channels are available - those the CAC_FINISHED + * event has effected another wdev state, or there is a channel + * in unavailable state in wdev chandef - those the RADAR_DETECTED + * event has effected another wdev state. + * In both cases we should end the CAC on the wdev. + */ + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { + if (wdev->cac_started && + !cfg80211_chandef_dfs_usable(&rdev->wiphy, &wdev->chandef)) + rdev_end_cac(rdev, wdev->netdev); + } +} void regulatory_propagate_dfs_state(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, @@ -3893,8 +3937,10 @@ void regulatory_propagate_dfs_state(struct wiphy *wiphy, cfg80211_set_dfs_state(&rdev->wiphy, chandef, dfs_state); if (event == NL80211_RADAR_DETECTED || - event == NL80211_RADAR_CAC_FINISHED) + event == NL80211_RADAR_CAC_FINISHED) { cfg80211_sched_dfs_chan_update(rdev); + cfg80211_check_and_end_cac(rdev); + } nl80211_radar_notify(rdev, chandef, event, NULL, GFP_KERNEL); } diff --git a/net/wireless/reg.h b/net/wireless/reg.h index 504133d76de4..f9e83031a40a 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -114,7 +114,7 @@ void regulatory_hint_country_ie(struct wiphy *wiphy, u8 country_ie_len); /** - * regulatory_hint_disconnect - informs all devices have been disconneted + * regulatory_hint_disconnect - informs all devices have been disconnected * * Regulotory rules can be enhanced further upon scanning and upon * connection to an AP. These rules become stale if we disconnect @@ -156,14 +156,6 @@ bool regulatory_indoor_allowed(void); #define REG_PRE_CAC_EXPIRY_GRACE_MS 2000 /** - * regulatory_pre_cac_allowed - if pre-CAC allowed in the current dfs domain - * @wiphy: wiphy for which pre-CAC capability is checked. - - * Pre-CAC is allowed only in ETSI domain. - */ -bool regulatory_pre_cac_allowed(struct wiphy *wiphy); - -/** * regulatory_propagate_dfs_state - Propagate DFS channel state to other wiphys * @wiphy - wiphy on which radar is detected and the event will be propagated * to other available wiphys having the same DFS domain diff --git a/net/wireless/scan.c b/net/wireless/scan.c index d66e6d4b7555..aef240fdf8df 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1091,6 +1091,93 @@ struct cfg80211_non_tx_bss { u8 bssid_index; }; +static bool +cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, + struct cfg80211_internal_bss *known, + struct cfg80211_internal_bss *new, + bool signal_valid) +{ + lockdep_assert_held(&rdev->bss_lock); + + /* Update IEs */ + if (rcu_access_pointer(new->pub.proberesp_ies)) { + const struct cfg80211_bss_ies *old; + + old = rcu_access_pointer(known->pub.proberesp_ies); + + rcu_assign_pointer(known->pub.proberesp_ies, + new->pub.proberesp_ies); + /* Override possible earlier Beacon frame IEs */ + rcu_assign_pointer(known->pub.ies, + new->pub.proberesp_ies); + if (old) + kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); + } else if (rcu_access_pointer(new->pub.beacon_ies)) { + const struct cfg80211_bss_ies *old; + struct cfg80211_internal_bss *bss; + + if (known->pub.hidden_beacon_bss && + !list_empty(&known->hidden_list)) { + const struct cfg80211_bss_ies *f; + + /* The known BSS struct is one of the probe + * response members of a group, but we're + * receiving a beacon (beacon_ies in the new + * bss is used). This can only mean that the + * AP changed its beacon from not having an + * SSID to showing it, which is confusing so + * drop this information. + */ + + f = rcu_access_pointer(new->pub.beacon_ies); + kfree_rcu((struct cfg80211_bss_ies *)f, rcu_head); + return false; + } + + old = rcu_access_pointer(known->pub.beacon_ies); + + rcu_assign_pointer(known->pub.beacon_ies, new->pub.beacon_ies); + + /* Override IEs if they were from a beacon before */ + if (old == rcu_access_pointer(known->pub.ies)) + rcu_assign_pointer(known->pub.ies, new->pub.beacon_ies); + + /* Assign beacon IEs to all sub entries */ + list_for_each_entry(bss, &known->hidden_list, hidden_list) { + const struct cfg80211_bss_ies *ies; + + ies = rcu_access_pointer(bss->pub.beacon_ies); + WARN_ON(ies != old); + + rcu_assign_pointer(bss->pub.beacon_ies, + new->pub.beacon_ies); + } + + if (old) + kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); + } + + known->pub.beacon_interval = new->pub.beacon_interval; + + /* don't update the signal if beacon was heard on + * adjacent channel. + */ + if (signal_valid) + known->pub.signal = new->pub.signal; + known->pub.capability = new->pub.capability; + known->ts = new->ts; + known->ts_boottime = new->ts_boottime; + known->parent_tsf = new->parent_tsf; + known->pub.chains = new->pub.chains; + memcpy(known->pub.chain_signal, new->pub.chain_signal, + IEEE80211_MAX_CHAINS); + ether_addr_copy(known->parent_bssid, new->parent_bssid); + known->pub.max_bssid_indicator = new->pub.max_bssid_indicator; + known->pub.bssid_index = new->pub.bssid_index; + + return true; +} + /* Returned bss is reference counted and must be cleaned up appropriately. */ struct cfg80211_internal_bss * cfg80211_bss_update(struct cfg80211_registered_device *rdev, @@ -1114,88 +1201,8 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, found = rb_find_bss(rdev, tmp, BSS_CMP_REGULAR); if (found) { - /* Update IEs */ - if (rcu_access_pointer(tmp->pub.proberesp_ies)) { - const struct cfg80211_bss_ies *old; - - old = rcu_access_pointer(found->pub.proberesp_ies); - - rcu_assign_pointer(found->pub.proberesp_ies, - tmp->pub.proberesp_ies); - /* Override possible earlier Beacon frame IEs */ - rcu_assign_pointer(found->pub.ies, - tmp->pub.proberesp_ies); - if (old) - kfree_rcu((struct cfg80211_bss_ies *)old, - rcu_head); - } else if (rcu_access_pointer(tmp->pub.beacon_ies)) { - const struct cfg80211_bss_ies *old; - struct cfg80211_internal_bss *bss; - - if (found->pub.hidden_beacon_bss && - !list_empty(&found->hidden_list)) { - const struct cfg80211_bss_ies *f; - - /* - * The found BSS struct is one of the probe - * response members of a group, but we're - * receiving a beacon (beacon_ies in the tmp - * bss is used). This can only mean that the - * AP changed its beacon from not having an - * SSID to showing it, which is confusing so - * drop this information. - */ - - f = rcu_access_pointer(tmp->pub.beacon_ies); - kfree_rcu((struct cfg80211_bss_ies *)f, - rcu_head); - goto drop; - } - - old = rcu_access_pointer(found->pub.beacon_ies); - - rcu_assign_pointer(found->pub.beacon_ies, - tmp->pub.beacon_ies); - - /* Override IEs if they were from a beacon before */ - if (old == rcu_access_pointer(found->pub.ies)) - rcu_assign_pointer(found->pub.ies, - tmp->pub.beacon_ies); - - /* Assign beacon IEs to all sub entries */ - list_for_each_entry(bss, &found->hidden_list, - hidden_list) { - const struct cfg80211_bss_ies *ies; - - ies = rcu_access_pointer(bss->pub.beacon_ies); - WARN_ON(ies != old); - - rcu_assign_pointer(bss->pub.beacon_ies, - tmp->pub.beacon_ies); - } - - if (old) - kfree_rcu((struct cfg80211_bss_ies *)old, - rcu_head); - } - - found->pub.beacon_interval = tmp->pub.beacon_interval; - /* - * don't update the signal if beacon was heard on - * adjacent channel. - */ - if (signal_valid) - found->pub.signal = tmp->pub.signal; - found->pub.capability = tmp->pub.capability; - found->ts = tmp->ts; - found->ts_boottime = tmp->ts_boottime; - found->parent_tsf = tmp->parent_tsf; - found->pub.chains = tmp->pub.chains; - memcpy(found->pub.chain_signal, tmp->pub.chain_signal, - IEEE80211_MAX_CHAINS); - ether_addr_copy(found->parent_bssid, tmp->parent_bssid); - found->pub.max_bssid_indicator = tmp->pub.max_bssid_indicator; - found->pub.bssid_index = tmp->pub.bssid_index; + if (!cfg80211_update_known_bss(rdev, found, tmp, signal_valid)) + goto drop; } else { struct cfg80211_internal_bss *new; struct cfg80211_internal_bss *hidden; @@ -1368,6 +1375,7 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, struct cfg80211_internal_bss tmp = {}, *res; int bss_type; bool signal_valid; + unsigned long ts; if (WARN_ON(!wiphy)) return NULL; @@ -1390,8 +1398,11 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, tmp.ts_boottime = data->boottime_ns; if (non_tx_data) { tmp.pub.transmitted_bss = non_tx_data->tx_bss; + ts = bss_from_pub(non_tx_data->tx_bss)->ts; tmp.pub.bssid_index = non_tx_data->bssid_index; tmp.pub.max_bssid_indicator = non_tx_data->max_bssid_indicator; + } else { + ts = jiffies; } /* @@ -1425,8 +1436,7 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, signal_valid = abs(data->chan->center_freq - channel->center_freq) <= wiphy->max_adj_channel_rssi_comp; - res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid, - jiffies); + res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid, ts); if (!res) return NULL; @@ -1440,7 +1450,7 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, regulatory_hint_found_beacon(wiphy, channel, gfp); } - if (non_tx_data && non_tx_data->tx_bss) { + if (non_tx_data) { /* this is a nontransmitting bss, we need to add it to * transmitting bss' list if it is not there */ @@ -1659,6 +1669,8 @@ cfg80211_inform_bss_data(struct wiphy *wiphy, res = cfg80211_inform_single_bss_data(wiphy, data, ftype, bssid, tsf, capability, beacon_interval, ie, ielen, NULL, gfp); + if (!res) + return NULL; non_tx_data.tx_bss = res; cfg80211_parse_mbssid_data(wiphy, data, ftype, bssid, tsf, beacon_interval, ie, ielen, &non_tx_data, @@ -1691,8 +1703,7 @@ cfg80211_parse_mbssid_frame_data(struct wiphy *wiphy, static void cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, struct cfg80211_bss *nontrans_bss, - struct ieee80211_mgmt *mgmt, size_t len, - gfp_t gfp) + struct ieee80211_mgmt *mgmt, size_t len) { u8 *ie, *new_ie, *pos; const u8 *nontrans_ssid, *trans_ssid, *mbssid; @@ -1703,6 +1714,8 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, const struct cfg80211_bss_ies *old; u8 cpy_len; + lockdep_assert_held(&wiphy_to_rdev(wiphy)->bss_lock); + ie = mgmt->u.probe_resp.variable; new_ie_len = ielen; @@ -1711,26 +1724,30 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, return; new_ie_len -= trans_ssid[1]; mbssid = cfg80211_find_ie(WLAN_EID_MULTIPLE_BSSID, ie, ielen); - if (!mbssid) + /* + * It's not valid to have the MBSSID element before SSID + * ignore if that happens - the code below assumes it is + * after (while copying things inbetween). + */ + if (!mbssid || mbssid < trans_ssid) return; new_ie_len -= mbssid[1]; - rcu_read_lock(); + nontrans_ssid = ieee80211_bss_get_ie(nontrans_bss, WLAN_EID_SSID); - if (!nontrans_ssid) { - rcu_read_unlock(); + if (!nontrans_ssid) return; - } + new_ie_len += nontrans_ssid[1]; - rcu_read_unlock(); /* generate new ie for nontrans BSS * 1. replace SSID with nontrans BSS' SSID * 2. skip MBSSID IE */ - new_ie = kzalloc(new_ie_len, gfp); + new_ie = kzalloc(new_ie_len, GFP_ATOMIC); if (!new_ie) return; - new_ies = kzalloc(sizeof(*new_ies) + new_ie_len, gfp); + + new_ies = kzalloc(sizeof(*new_ies) + new_ie_len, GFP_ATOMIC); if (!new_ies) goto out_free; @@ -1776,7 +1793,6 @@ static struct cfg80211_bss * cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy, struct cfg80211_inform_bss *data, struct ieee80211_mgmt *mgmt, size_t len, - struct cfg80211_non_tx_bss *non_tx_data, gfp_t gfp) { struct cfg80211_internal_bss tmp = {}, *res; @@ -1835,11 +1851,6 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy, tmp.pub.chains = data->chains; memcpy(tmp.pub.chain_signal, data->chain_signal, IEEE80211_MAX_CHAINS); ether_addr_copy(tmp.parent_bssid, data->parent_bssid); - if (non_tx_data) { - tmp.pub.transmitted_bss = non_tx_data->tx_bss; - tmp.pub.bssid_index = non_tx_data->bssid_index; - tmp.pub.max_bssid_indicator = non_tx_data->max_bssid_indicator; - } signal_valid = abs(data->chan->center_freq - channel->center_freq) <= wiphy->max_adj_channel_rssi_comp; @@ -1877,7 +1888,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, struct cfg80211_non_tx_bss non_tx_data; res = cfg80211_inform_single_bss_frame_data(wiphy, data, mgmt, - len, NULL, gfp); + len, gfp); if (!res || !wiphy->support_mbssid || !cfg80211_find_ie(WLAN_EID_MULTIPLE_BSSID, ie, ielen)) return res; @@ -1890,6 +1901,8 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, cfg80211_parse_mbssid_frame_data(wiphy, data, mgmt, len, &non_tx_data, gfp); + spin_lock_bh(&wiphy_to_rdev(wiphy)->bss_lock); + /* check if the res has other nontransmitting bss which is not * in MBSSID IE */ @@ -1904,8 +1917,9 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, ies2 = rcu_access_pointer(tmp_bss->ies); if (ies2->tsf < ies1->tsf) cfg80211_update_notlisted_nontrans(wiphy, tmp_bss, - mgmt, len, gfp); + mgmt, len); } + spin_unlock_bh(&wiphy_to_rdev(wiphy)->bss_lock); return res; } @@ -1995,6 +2009,85 @@ void cfg80211_bss_iter(struct wiphy *wiphy, } EXPORT_SYMBOL(cfg80211_bss_iter); +void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev, + struct ieee80211_channel *chan) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct cfg80211_internal_bss *cbss = wdev->current_bss; + struct cfg80211_internal_bss *new = NULL; + struct cfg80211_internal_bss *bss; + struct cfg80211_bss *nontrans_bss; + struct cfg80211_bss *tmp; + + spin_lock_bh(&rdev->bss_lock); + + if (WARN_ON(cbss->pub.channel == chan)) + goto done; + + /* use transmitting bss */ + if (cbss->pub.transmitted_bss) + cbss = container_of(cbss->pub.transmitted_bss, + struct cfg80211_internal_bss, + pub); + + cbss->pub.channel = chan; + + list_for_each_entry(bss, &rdev->bss_list, list) { + if (!cfg80211_bss_type_match(bss->pub.capability, + bss->pub.channel->band, + wdev->conn_bss_type)) + continue; + + if (bss == cbss) + continue; + + if (!cmp_bss(&bss->pub, &cbss->pub, BSS_CMP_REGULAR)) { + new = bss; + break; + } + } + + if (new) { + /* to save time, update IEs for transmitting bss only */ + if (cfg80211_update_known_bss(rdev, cbss, new, false)) { + new->pub.proberesp_ies = NULL; + new->pub.beacon_ies = NULL; + } + + list_for_each_entry_safe(nontrans_bss, tmp, + &new->pub.nontrans_list, + nontrans_list) { + bss = container_of(nontrans_bss, + struct cfg80211_internal_bss, pub); + if (__cfg80211_unlink_bss(rdev, bss)) + rdev->bss_generation++; + } + + WARN_ON(atomic_read(&new->hold)); + if (!WARN_ON(!__cfg80211_unlink_bss(rdev, new))) + rdev->bss_generation++; + } + + rb_erase(&cbss->rbn, &rdev->bss_tree); + rb_insert_bss(rdev, cbss); + rdev->bss_generation++; + + list_for_each_entry_safe(nontrans_bss, tmp, + &cbss->pub.nontrans_list, + nontrans_list) { + bss = container_of(nontrans_bss, + struct cfg80211_internal_bss, pub); + bss->pub.channel = chan; + rb_erase(&bss->rbn, &rdev->bss_tree); + rb_insert_bss(rdev, bss); + rdev->bss_generation++; + } + +done: + spin_unlock_bh(&rdev->bss_lock); +} + #ifdef CONFIG_CFG80211_WEXT static struct cfg80211_registered_device * cfg80211_get_dev_from_ifindex(struct net *net, int ifindex) diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 7a6c38ddc65a..d32a2ec4d96a 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -1307,14 +1307,14 @@ void cfg80211_autodisconnect_wk(struct work_struct *work) if (wdev->conn_owner_nlportid) { switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - cfg80211_leave_ibss(rdev, wdev->netdev, false); + __cfg80211_leave_ibss(rdev, wdev->netdev, false); break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - cfg80211_stop_ap(rdev, wdev->netdev, false); + __cfg80211_stop_ap(rdev, wdev->netdev, false); break; case NL80211_IFTYPE_MESH_POINT: - cfg80211_leave_mesh(rdev, wdev->netdev); + __cfg80211_leave_mesh(rdev, wdev->netdev); break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 4fbb91a511ae..3ef1679b0e66 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -646,6 +646,11 @@ DEFINE_EVENT(wiphy_netdev_evt, rdev_flush_pmksa, TP_ARGS(wiphy, netdev) ); +DEFINE_EVENT(wiphy_netdev_evt, rdev_end_cac, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), + TP_ARGS(wiphy, netdev) +); + DECLARE_EVENT_CLASS(station_add_change, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac, struct station_parameters *params), @@ -2009,7 +2014,7 @@ TRACE_EVENT(rdev_start_nan, WIPHY_ENTRY WDEV_ENTRY __field(u8, master_pref) - __field(u8, bands); + __field(u8, bands) ), TP_fast_assign( WIPHY_ASSIGN; @@ -2031,8 +2036,8 @@ TRACE_EVENT(rdev_nan_change_conf, WIPHY_ENTRY WDEV_ENTRY __field(u8, master_pref) - __field(u8, bands); - __field(u32, changes); + __field(u8, bands) + __field(u32, changes) ), TP_fast_assign( WIPHY_ASSIGN; @@ -2446,10 +2451,11 @@ TRACE_EVENT(rdev_set_mcast_rate, sizeof(int) * NUM_NL80211_BANDS); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " - "mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 60GHz=0x%x]", + "mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 6GHz=0x%x, 60GHz=0x%x]", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->mcast_rate[NL80211_BAND_2GHZ], __entry->mcast_rate[NL80211_BAND_5GHZ], + __entry->mcast_rate[NL80211_BAND_6GHZ], __entry->mcast_rate[NL80211_BAND_60GHZ]) ); diff --git a/net/wireless/util.c b/net/wireless/util.c index d0e35b7b9e35..8481e9ac33da 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -91,6 +91,11 @@ int ieee80211_channel_to_frequency(int chan, enum nl80211_band band) else return 5000 + chan * 5; break; + case NL80211_BAND_6GHZ: + /* see 802.11ax D4.1 27.3.22.2 */ + if (chan <= 253) + return 5940 + chan * 5; + break; case NL80211_BAND_60GHZ: if (chan < 7) return 56160 + chan * 2160; @@ -111,8 +116,11 @@ int ieee80211_frequency_to_channel(int freq) return (freq - 2407) / 5; else if (freq >= 4910 && freq <= 4980) return (freq - 4000) / 5; - else if (freq <= 45000) /* DMG band lower limit */ + else if (freq < 5945) return (freq - 5000) / 5; + else if (freq <= 45000) /* DMG band lower limit */ + /* see 802.11ax D4.1 27.3.22.2 */ + return (freq - 5940) / 5; else if (freq >= 58320 && freq <= 70200) return (freq - 56160) / 2160; else @@ -148,6 +156,7 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) switch (sband->band) { case NL80211_BAND_5GHZ: + case NL80211_BAND_6GHZ: want = 3; for (i = 0; i < sband->n_bitrates; i++) { if (sband->bitrates[i].bitrate == 60 || @@ -233,25 +242,30 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, switch (params->cipher) { case WLAN_CIPHER_SUITE_TKIP: + /* Extended Key ID can only be used with CCMP/GCMP ciphers */ + if ((pairwise && key_idx) || + params->mode != NL80211_KEY_RX_TX) + return -EINVAL; + break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: - /* IEEE802.11-2016 allows only 0 and - when using Extended Key - * ID - 1 as index for pairwise keys. + /* IEEE802.11-2016 allows only 0 and - when supporting + * Extended Key ID - 1 as index for pairwise keys. * @NL80211_KEY_NO_TX is only allowed for pairwise keys when * the driver supports Extended Key ID. * @NL80211_KEY_SET_TX can't be set when installing and * validating a key. */ - if (params->mode == NL80211_KEY_NO_TX) { - if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_EXT_KEY_ID)) - return -EINVAL; - else if (!pairwise || key_idx < 0 || key_idx > 1) + if ((params->mode == NL80211_KEY_NO_TX && !pairwise) || + params->mode == NL80211_KEY_SET_TX) + return -EINVAL; + if (wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_EXT_KEY_ID)) { + if (pairwise && (key_idx < 0 || key_idx > 1)) return -EINVAL; - } else if ((pairwise && key_idx) || - params->mode == NL80211_KEY_SET_TX) { + } else if (pairwise && key_idx) { return -EINVAL; } break; @@ -550,7 +564,7 @@ __frame_add_frag(struct sk_buff *skb, struct page *page, struct skb_shared_info *sh = skb_shinfo(skb); int page_offset; - page_ref_inc(page); + get_page(page); page_offset = ptr - page_address(page); skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size); } @@ -955,6 +969,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, } cfg80211_process_rdev_events(rdev); + cfg80211_mlme_purge_registrations(dev->ieee80211_ptr); } err = rdev_change_virtual_intf(rdev, dev, ntype, params); @@ -1034,7 +1049,7 @@ static u32 cfg80211_calculate_bitrate_ht(struct rate_info *rate) return (bitrate + 50000) / 100000; } -static u32 cfg80211_calculate_bitrate_60g(struct rate_info *rate) +static u32 cfg80211_calculate_bitrate_dmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { /* control PHY */ @@ -1081,6 +1096,40 @@ static u32 cfg80211_calculate_bitrate_60g(struct rate_info *rate) return __mcs2bitrate[rate->mcs]; } +static u32 cfg80211_calculate_bitrate_edmg(struct rate_info *rate) +{ + static const u32 __mcs2bitrate[] = { + /* control PHY */ + [0] = 275, + /* SC PHY */ + [1] = 3850, + [2] = 7700, + [3] = 9625, + [4] = 11550, + [5] = 12512, /* 1251.25 mbps */ + [6] = 13475, + [7] = 15400, + [8] = 19250, + [9] = 23100, + [10] = 25025, + [11] = 26950, + [12] = 30800, + [13] = 38500, + [14] = 46200, + [15] = 50050, + [16] = 53900, + [17] = 57750, + [18] = 69300, + [19] = 75075, + [20] = 80850, + }; + + if (WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(__mcs2bitrate))) + return 0; + + return __mcs2bitrate[rate->mcs] * rate->n_bonded_ch; +} + static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) { static const u32 base[4][10] = { @@ -1253,8 +1302,10 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate) { if (rate->flags & RATE_INFO_FLAGS_MCS) return cfg80211_calculate_bitrate_ht(rate); - if (rate->flags & RATE_INFO_FLAGS_60G) - return cfg80211_calculate_bitrate_60g(rate); + if (rate->flags & RATE_INFO_FLAGS_DMG) + return cfg80211_calculate_bitrate_dmg(rate); + if (rate->flags & RATE_INFO_FLAGS_EDMG) + return cfg80211_calculate_bitrate_edmg(rate); if (rate->flags & RATE_INFO_FLAGS_VHT_MCS) return cfg80211_calculate_bitrate_vht(rate); if (rate->flags & RATE_INFO_FLAGS_HE_MCS) @@ -1466,6 +1517,9 @@ bool ieee80211_operating_class_to_band(u8 operating_class, case 128 ... 130: *band = NL80211_BAND_5GHZ; return true; + case 131 ... 135: + *band = NL80211_BAND_6GHZ; + return true; case 81: case 82: case 83: @@ -1505,7 +1559,8 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, } if (freq == 2484) { - if (chandef->width > NL80211_CHAN_WIDTH_40) + /* channel 14 is only for IEEE 802.11b */ + if (chandef->width != NL80211_CHAN_WIDTH_20_NOHT) return false; *op_class = 82; /* channel 14 */ diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 46e4d69db845..cac9e28d852b 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -7,6 +7,7 @@ * we directly assign the wireless handlers of wireless interfaces. * * Copyright 2008-2009 Johannes Berg <johannes@sipsolutions.net> + * Copyright (C) 2019 Intel Corporation */ #include <linux/export.h> @@ -797,7 +798,7 @@ static int cfg80211_wext_giwfreq(struct net_device *dev, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - struct cfg80211_chan_def chandef; + struct cfg80211_chan_def chandef = {}; int ret; switch (wdev->iftype) { @@ -864,8 +865,8 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev, } } } else { - rfkill_set_sw_state(rdev->rfkill, true); - schedule_work(&rdev->rfkill_sync); + if (rfkill_set_sw_state(rdev->rfkill, true)) + schedule_work(&rdev->rfkill_block); return 0; } diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 5e677dac2a0c..69102fda9ebd 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -657,7 +657,8 @@ struct iw_statistics *get_wireless_stats(struct net_device *dev) return NULL; } -static int iw_handler_get_iwstats(struct net_device * dev, +/* noinline to avoid a bogus warning with -O3 */ +static noinline int iw_handler_get_iwstats(struct net_device * dev, struct iw_request_info * info, union iwreq_data * wrqu, char * extra) diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index c67d7a82ab13..73fd0eae08ca 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -202,6 +202,7 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; + int ret = 0; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) @@ -219,7 +220,10 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev, if (ie) { data->flags = 1; data->length = ie[1]; - memcpy(ssid, ie + 2, data->length); + if (data->length > IW_ESSID_MAX_SIZE) + ret = -EINVAL; + else + memcpy(ssid, ie + 2, data->length); } rcu_read_unlock(); } else if (wdev->wext.connect.ssid && wdev->wext.connect.ssid_len) { @@ -229,7 +233,7 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev, } wdev_unlock(wdev); - return 0; + return ret; } int cfg80211_mgd_wext_siwap(struct net_device *dev, diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 6aee9f5e8e71..d5b09bbff375 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -659,6 +659,12 @@ static int x25_release(struct socket *sock) sock_set_flag(sk, SOCK_DEAD); sock_set_flag(sk, SOCK_DESTROY); break; + + case X25_STATE_5: + x25_write_internal(sk, X25_CLEAR_REQUEST); + x25_disconnect(sk, 0, 0, 0); + __x25_destroy_socket(sk); + goto out; } sock_orphan(sk); @@ -760,6 +766,10 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr, if (sk->sk_state == TCP_ESTABLISHED) goto out; + rc = -EALREADY; /* Do nothing if call is already in progress */ + if (sk->sk_state == TCP_SYN_SENT) + goto out; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; @@ -806,7 +816,7 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr, /* Now the loop */ rc = -EINPROGRESS; if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) - goto out_put_neigh; + goto out; rc = x25_wait_for_connection_establishment(sk); if (rc) @@ -891,7 +901,7 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags, /* Now attach up the new socket */ skb->sk = NULL; kfree_skb(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; rc = 0; out2: @@ -1054,6 +1064,8 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, if (test_bit(X25_ACCPT_APPRV_FLAG, &makex25->flags)) { x25_write_internal(make, X25_CALL_ACCEPTED); makex25->state = X25_STATE_3; + } else { + makex25->state = X25_STATE_5; } /* @@ -1062,7 +1074,7 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); makex25->calluserdata.cudlength = skb->len; - sk->sk_ack_backlog++; + sk_acceptq_added(sk); x25_insert_socket(make); diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index 5c111bc3c8ea..00e782335cb0 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c @@ -55,7 +55,7 @@ static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb) if (!sock_owned_by_user(sk)) { queued = x25_process_rx_frame(sk, skb); } else { - queued = !sk_add_backlog(sk, skb, sk->sk_rcvbuf); + queued = !sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)); } bh_unlock_sock(sk); sock_put(sk); diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index f97c43344e95..4d3bb46aaae0 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -382,6 +382,35 @@ out_clear: return 0; } +/* + * State machine for state 5, Call Accepted / Call Connected pending (X25_ACCPT_APPRV_FLAG). + * The handling of the timer(s) is in file x25_timer.c + * Handling of state 0 and connection release is in af_x25.c. + */ +static int x25_state5_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + struct x25_sock *x25 = x25_sk(sk); + + switch (frametype) { + case X25_CLEAR_REQUEST: + if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 2)) { + x25_write_internal(sk, X25_CLEAR_REQUEST); + x25->state = X25_STATE_2; + x25_start_t23timer(sk); + return 0; + } + + x25_write_internal(sk, X25_CLEAR_CONFIRMATION); + x25_disconnect(sk, 0, skb->data[3], skb->data[4]); + break; + + default: + break; + } + + return 0; +} + /* Higher level upcall for a LAPB frame */ int x25_process_rx_frame(struct sock *sk, struct sk_buff *skb) { @@ -406,6 +435,9 @@ int x25_process_rx_frame(struct sock *sk, struct sk_buff *skb) case X25_STATE_4: queued = x25_state4_machine(sk, skb, frametype); break; + case X25_STATE_5: + queued = x25_state5_machine(sk, skb, frametype); + break; } x25_kick(sk); diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 83de74ca729a..fa7bb5e060d0 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -14,6 +14,7 @@ #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/idr.h> +#include <linux/vmalloc.h> #include "xdp_umem.h" #include "xsk_queue.h" @@ -26,6 +27,9 @@ void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) { unsigned long flags; + if (!xs->tx) + return; + spin_lock_irqsave(&umem->xsk_list_lock, flags); list_add_rcu(&xs->list, &umem->xsk_list); spin_unlock_irqrestore(&umem->xsk_list_lock, flags); @@ -35,6 +39,9 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) { unsigned long flags; + if (!xs->tx) + return; + spin_lock_irqsave(&umem->xsk_list_lock, flags); list_del_rcu(&xs->list); spin_unlock_irqrestore(&umem->xsk_list_lock, flags); @@ -105,14 +112,22 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, umem->dev = dev; umem->queue_id = queue_id; + if (flags & XDP_USE_NEED_WAKEUP) { + umem->flags |= XDP_UMEM_USES_NEED_WAKEUP; + /* Tx needs to be explicitly woken up the first time. + * Also for supporting drivers that do not implement this + * feature. They will always have to call sendto(). + */ + xsk_set_tx_need_wakeup(umem); + } + dev_hold(dev); if (force_copy) /* For copy-mode, we are done. */ return 0; - if (!dev->netdev_ops->ndo_bpf || - !dev->netdev_ops->ndo_xsk_async_xmit) { + if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_wakeup) { err = -EOPNOTSUPP; goto err_unreg_umem; } @@ -164,17 +179,41 @@ void xdp_umem_clear_dev(struct xdp_umem *umem) umem->zc = false; } -static void xdp_umem_unpin_pages(struct xdp_umem *umem) +static void xdp_umem_unmap_pages(struct xdp_umem *umem) { unsigned int i; + for (i = 0; i < umem->npgs; i++) + if (PageHighMem(umem->pgs[i])) + vunmap(umem->pages[i].addr); +} + +static int xdp_umem_map_pages(struct xdp_umem *umem) +{ + unsigned int i; + void *addr; + for (i = 0; i < umem->npgs; i++) { - struct page *page = umem->pgs[i]; + if (PageHighMem(umem->pgs[i])) + addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL); + else + addr = page_address(umem->pgs[i]); + + if (!addr) { + xdp_umem_unmap_pages(umem); + return -ENOMEM; + } - set_page_dirty_lock(page); - put_page(page); + umem->pages[i].addr = addr; } + return 0; +} + +static void xdp_umem_unpin_pages(struct xdp_umem *umem) +{ + unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true); + kfree(umem->pgs); umem->pgs = NULL; } @@ -207,9 +246,10 @@ static void xdp_umem_release(struct xdp_umem *umem) xsk_reuseq_destroy(umem); + xdp_umem_unmap_pages(umem); xdp_umem_unpin_pages(umem); - kfree(umem->pages); + kvfree(umem->pages); umem->pages = NULL; xdp_umem_unaccount_pages(umem); @@ -251,7 +291,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem) return -ENOMEM; down_read(¤t->mm->mmap_sem); - npgs = get_user_pages(umem->address, umem->npgs, + npgs = pin_user_pages(umem->address, umem->npgs, gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); up_read(¤t->mm->mmap_sem); @@ -299,10 +339,11 @@ static int xdp_umem_account_pages(struct xdp_umem *umem) static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { + bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; u32 chunk_size = mr->chunk_size, headroom = mr->headroom; unsigned int chunks, chunks_per_page; u64 addr = mr->addr, size = mr->len; - int size_chk, err, i; + int size_chk, err; if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) { /* Strictly speaking we could support this, if: @@ -314,7 +355,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) return -EINVAL; } - if (!is_power_of_2(chunk_size)) + if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNK_FLAG | + XDP_UMEM_USES_NEED_WAKEUP)) + return -EINVAL; + + if (!unaligned_chunks && !is_power_of_2(chunk_size)) return -EINVAL; if (!PAGE_ALIGNED(addr)) { @@ -331,24 +376,26 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (chunks == 0) return -EINVAL; - chunks_per_page = PAGE_SIZE / chunk_size; - if (chunks < chunks_per_page || chunks % chunks_per_page) - return -EINVAL; - - headroom = ALIGN(headroom, 64); + if (!unaligned_chunks) { + chunks_per_page = PAGE_SIZE / chunk_size; + if (chunks < chunks_per_page || chunks % chunks_per_page) + return -EINVAL; + } size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM; if (size_chk < 0) return -EINVAL; umem->address = (unsigned long)addr; - umem->chunk_mask = ~((u64)chunk_size - 1); + umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK + : ~((u64)chunk_size - 1); umem->size = size; umem->headroom = headroom; umem->chunk_size_nohr = chunk_size - headroom; umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; umem->user = NULL; + umem->flags = mr->flags; INIT_LIST_HEAD(&umem->xsk_list); spin_lock_init(&umem->xsk_list_lock); @@ -362,17 +409,21 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (err) goto out_account; - umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL); + umem->pages = kvcalloc(umem->npgs, sizeof(*umem->pages), + GFP_KERNEL_ACCOUNT); if (!umem->pages) { err = -ENOMEM; - goto out_account; + goto out_pin; } - for (i = 0; i < umem->npgs; i++) - umem->pages[i].addr = page_address(umem->pgs[i]); + err = xdp_umem_map_pages(umem); + if (!err) + return 0; - return 0; + kvfree(umem->pages); +out_pin: + xdp_umem_unpin_pages(umem); out_account: xdp_umem_unaccount_pages(umem); return err; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 59b57d708697..df600487a68d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -31,6 +31,8 @@ #define TX_BATCH_SIZE 16 +static DEFINE_PER_CPU(struct list_head, xskmap_flush_list); + bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) { return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && @@ -39,37 +41,119 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) { - return xskq_has_addrs(umem->fq, cnt); + return xskq_cons_has_entries(umem->fq, cnt); } EXPORT_SYMBOL(xsk_umem_has_addrs); -u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) +bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) { - return xskq_peek_addr(umem->fq, addr); + return xskq_cons_peek_addr(umem->fq, addr, umem); } EXPORT_SYMBOL(xsk_umem_peek_addr); -void xsk_umem_discard_addr(struct xdp_umem *umem) +void xsk_umem_release_addr(struct xdp_umem *umem) { - xskq_discard_addr(umem->fq); + xskq_cons_release(umem->fq); +} +EXPORT_SYMBOL(xsk_umem_release_addr); + +void xsk_set_rx_need_wakeup(struct xdp_umem *umem) +{ + if (umem->need_wakeup & XDP_WAKEUP_RX) + return; + + umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP; + umem->need_wakeup |= XDP_WAKEUP_RX; +} +EXPORT_SYMBOL(xsk_set_rx_need_wakeup); + +void xsk_set_tx_need_wakeup(struct xdp_umem *umem) +{ + struct xdp_sock *xs; + + if (umem->need_wakeup & XDP_WAKEUP_TX) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; + } + rcu_read_unlock(); + + umem->need_wakeup |= XDP_WAKEUP_TX; +} +EXPORT_SYMBOL(xsk_set_tx_need_wakeup); + +void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) +{ + if (!(umem->need_wakeup & XDP_WAKEUP_RX)) + return; + + umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP; + umem->need_wakeup &= ~XDP_WAKEUP_RX; +} +EXPORT_SYMBOL(xsk_clear_rx_need_wakeup); + +void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) +{ + struct xdp_sock *xs; + + if (!(umem->need_wakeup & XDP_WAKEUP_TX)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; + } + rcu_read_unlock(); + + umem->need_wakeup &= ~XDP_WAKEUP_TX; +} +EXPORT_SYMBOL(xsk_clear_tx_need_wakeup); + +bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) +{ + return umem->flags & XDP_UMEM_USES_NEED_WAKEUP; +} +EXPORT_SYMBOL(xsk_umem_uses_need_wakeup); + +/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for + * each page. This is only required in copy mode. + */ +static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf, + u32 len, u32 metalen) +{ + void *to_buf = xdp_umem_get_data(umem, addr); + + addr = xsk_umem_add_offset_to_addr(addr); + if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) { + void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr; + u64 page_start = addr & ~(PAGE_SIZE - 1); + u64 first_len = PAGE_SIZE - (addr - page_start); + + memcpy(to_buf, from_buf, first_len + metalen); + memcpy(next_pg_addr, from_buf + first_len, len - first_len); + + return; + } + + memcpy(to_buf, from_buf, len + metalen); } -EXPORT_SYMBOL(xsk_umem_discard_addr); static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - void *to_buf, *from_buf; + u64 offset = xs->umem->headroom; + u64 addr, memcpy_addr; + void *from_buf; u32 metalen; - u64 addr; int err; - if (!xskq_peek_addr(xs->umem->fq, &addr) || + if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { xs->rx_dropped++; return -ENOSPC; } - addr += xs->umem->headroom; - if (unlikely(xdp_data_meta_unsupported(xdp))) { from_buf = xdp->data; metalen = 0; @@ -78,12 +162,14 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) metalen = xdp->data - xdp->data_meta; } - to_buf = xdp_umem_get_data(xs->umem, addr); - memcpy(to_buf, from_buf, len + metalen); - addr += metalen; - err = xskq_produce_batch_desc(xs->rx, addr, len); + memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset); + __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen); + + offset += metalen; + addr = xsk_umem_adjust_offset(xs->umem, addr, offset); + err = xskq_prod_reserve_desc(xs->rx, addr, len); if (!err) { - xskq_discard_addr(xs->umem->fq); + xskq_cons_release(xs->umem->fq); xdp_return_buff(xdp); return 0; } @@ -94,7 +180,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len); + int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len); if (err) xs->rx_dropped++; @@ -102,10 +188,23 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) return err; } -int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static bool xsk_is_bound(struct xdp_sock *xs) +{ + if (READ_ONCE(xs->state) == XSK_BOUND) { + /* Matches smp_wmb() in bind(). */ + smp_rmb(); + return true; + } + return false; +} + +static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 len; + if (!xsk_is_bound(xs)) + return -EINVAL; + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; @@ -115,16 +214,17 @@ int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); } -void xsk_flush(struct xdp_sock *xs) +static void xsk_flush(struct xdp_sock *xs) { - xskq_produce_flush_desc(xs->rx); - xs->sk.sk_data_ready(&xs->sk); + xskq_prod_submit(xs->rx); + sock_def_readable(&xs->sk); } int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 metalen = xdp->data - xdp->data_meta; u32 len = xdp->data_end - xdp->data; + u64 offset = xs->umem->headroom; void *buffer; u64 addr; int err; @@ -136,23 +236,23 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) goto out_unlock; } - if (!xskq_peek_addr(xs->umem->fq, &addr) || + if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { err = -ENOSPC; goto out_drop; } - addr += xs->umem->headroom; - + addr = xsk_umem_adjust_offset(xs->umem, addr, offset); buffer = xdp_umem_get_data(xs->umem, addr); memcpy(buffer, xdp->data_meta, len + metalen); - addr += metalen; - err = xskq_produce_batch_desc(xs->rx, addr, len); + + addr = xsk_umem_adjust_offset(xs->umem, addr, metalen); + err = xskq_prod_reserve_desc(xs->rx, addr, len); if (err) goto out_drop; - xskq_discard_addr(xs->umem->fq); - xskq_produce_flush_desc(xs->rx); + xskq_cons_release(xs->umem->fq); + xskq_prod_submit(xs->rx); spin_unlock_bh(&xs->rx_lock); @@ -166,9 +266,35 @@ out_unlock: return err; } +int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); + int err; + + err = xsk_rcv(xs, xdp); + if (err) + return err; + + if (!xs->flush_node.prev) + list_add(&xs->flush_node, flush_list); + + return 0; +} + +void __xsk_map_flush(void) +{ + struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); + struct xdp_sock *xs, *tmp; + + list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { + xsk_flush(xs); + __list_del_clearprev(&xs->flush_node); + } +} + void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) { - xskq_produce_flush_addr_n(umem->cq, nb_entries); + xskq_prod_submit_n(umem->cq, nb_entries); } EXPORT_SYMBOL(xsk_umem_complete_tx); @@ -190,13 +316,18 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) rcu_read_lock(); list_for_each_entry_rcu(xs, &umem->xsk_list, list) { - if (!xskq_peek_desc(xs->tx, desc)) + if (!xskq_cons_peek_desc(xs->tx, desc, umem)) continue; - if (xskq_produce_addr_lazy(umem->cq, desc->addr)) + /* This is the backpreassure mechanism for the Tx path. + * Reserve space in the completion queue and only proceed + * if there is space in it. This avoids having to implement + * any buffering in the Tx path. + */ + if (xskq_prod_reserve_addr(umem->cq, desc->addr)) goto out; - xskq_discard_desc(xs->tx); + xskq_cons_release(xs->tx); rcu_read_unlock(); return true; } @@ -207,12 +338,21 @@ out: } EXPORT_SYMBOL(xsk_umem_consume_tx); -static int xsk_zc_xmit(struct sock *sk) +static int xsk_wakeup(struct xdp_sock *xs, u8 flags) { - struct xdp_sock *xs = xdp_sk(sk); struct net_device *dev = xs->dev; + int err; - return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id); + rcu_read_lock(); + err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); + rcu_read_unlock(); + + return err; +} + +static int xsk_zc_xmit(struct xdp_sock *xs) +{ + return xsk_wakeup(xs, XDP_WAKEUP_TX); } static void xsk_destruct_skb(struct sk_buff *skb) @@ -222,17 +362,16 @@ static void xsk_destruct_skb(struct sk_buff *skb) unsigned long flags; spin_lock_irqsave(&xs->tx_completion_lock, flags); - WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr)); + xskq_prod_submit_addr(xs->umem->cq, addr); spin_unlock_irqrestore(&xs->tx_completion_lock, flags); sock_wfree(skb); } -static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, - size_t total_len) +static int xsk_generic_xmit(struct sock *sk) { - u32 max_batch = TX_BATCH_SIZE; struct xdp_sock *xs = xdp_sk(sk); + u32 max_batch = TX_BATCH_SIZE; bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; @@ -243,7 +382,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; - while (xskq_peek_desc(xs->tx, &desc)) { + while (xskq_cons_peek_desc(xs->tx, &desc, xs->umem)) { char *buffer; u64 addr; u32 len; @@ -264,7 +403,12 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, addr = desc.addr; buffer = xdp_umem_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); - if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) { + /* This is the backpreassure mechanism for the Tx path. + * Reserve space in the completion queue and only proceed + * if there is space in it. This avoids having to implement + * any buffering in the Tx path. + */ + if (unlikely(err) || xskq_prod_reserve(xs->umem->cq)) { kfree_skb(skb); goto out; } @@ -272,11 +416,11 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, skb->dev = xs->dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb_shinfo(skb)->destructor_arg = (void *)(long)addr; + skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr; skb->destructor = xsk_destruct_skb; err = dev_direct_xmit(skb, xs->queue_id); - xskq_discard_desc(xs->tx); + xskq_cons_release(xs->tx); /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) { /* SKB completed but not sent */ @@ -295,35 +439,57 @@ out: return err; } -static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) +static int __xsk_sendmsg(struct sock *sk) { - bool need_wait = !(m->msg_flags & MSG_DONTWAIT); - struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); - if (unlikely(!xs->dev)) - return -ENXIO; if (unlikely(!(xs->dev->flags & IFF_UP))) return -ENETDOWN; if (unlikely(!xs->tx)) return -ENOBUFS; - if (need_wait) + + return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk); +} + +static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) +{ + bool need_wait = !(m->msg_flags & MSG_DONTWAIT); + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + + if (unlikely(!xsk_is_bound(xs))) + return -ENXIO; + if (unlikely(need_wait)) return -EOPNOTSUPP; - return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len); + return __xsk_sendmsg(sk); } -static unsigned int xsk_poll(struct file *file, struct socket *sock, +static __poll_t xsk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { - unsigned int mask = datagram_poll(file, sock, wait); + __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); + struct xdp_umem *umem; - if (xs->rx && !xskq_empty_desc(xs->rx)) - mask |= POLLIN | POLLRDNORM; - if (xs->tx && !xskq_full_desc(xs->tx)) - mask |= POLLOUT | POLLWRNORM; + if (unlikely(!xsk_is_bound(xs))) + return mask; + + umem = xs->umem; + + if (umem->need_wakeup) { + if (xs->zc) + xsk_wakeup(xs, umem->need_wakeup); + else + /* Poll needs to drive Tx also in copy mode */ + __xsk_sendmsg(sk); + } + + if (xs->rx && !xskq_prod_is_empty(xs->rx)) + mask |= EPOLLIN | EPOLLRDNORM; + if (xs->tx && !xskq_cons_is_full(xs->tx)) + mask |= EPOLLOUT | EPOLLWRNORM; return mask; } @@ -342,7 +508,7 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue, /* Make sure queue is ready before it can be seen by others */ smp_wmb(); - *queue = q; + WRITE_ONCE(*queue, q); return 0; } @@ -350,10 +516,9 @@ static void xsk_unbind_dev(struct xdp_sock *xs) { struct net_device *dev = xs->dev; - if (!dev || xs->state != XSK_BOUND) + if (xs->state != XSK_BOUND) return; - - xs->state = XSK_UNBOUND; + WRITE_ONCE(xs->state, XSK_UNBOUND); /* Wait for driver to stop using the xdp socket. */ xdp_del_sk_umem(xs->umem, xs); @@ -362,6 +527,52 @@ static void xsk_unbind_dev(struct xdp_sock *xs) dev_put(dev); } +static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, + struct xdp_sock ***map_entry) +{ + struct xsk_map *map = NULL; + struct xsk_map_node *node; + + *map_entry = NULL; + + spin_lock_bh(&xs->map_list_lock); + node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, + node); + if (node) { + WARN_ON(xsk_map_inc(node->map)); + map = node->map; + *map_entry = node->map_entry; + } + spin_unlock_bh(&xs->map_list_lock); + return map; +} + +static void xsk_delete_from_maps(struct xdp_sock *xs) +{ + /* This function removes the current XDP socket from all the + * maps it resides in. We need to take extra care here, due to + * the two locks involved. Each map has a lock synchronizing + * updates to the entries, and each socket has a lock that + * synchronizes access to the list of maps (map_list). For + * deadlock avoidance the locks need to be taken in the order + * "map lock"->"socket map list lock". We start off by + * accessing the socket map list, and take a reference to the + * map to guarantee existence between the + * xsk_get_map_list_entry() and xsk_map_try_sock_delete() + * calls. Then we ask the map to remove the socket, which + * tries to remove the socket from the map. Note that there + * might be updates to the map between + * xsk_get_map_list_entry() and xsk_map_try_sock_delete(). + */ + struct xdp_sock **map_entry = NULL; + struct xsk_map *map; + + while ((map = xsk_get_map_list_entry(xs, &map_entry))) { + xsk_map_try_sock_delete(map, xs, map_entry); + xsk_map_put(map); + } +} + static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -381,7 +592,10 @@ static int xsk_release(struct socket *sock) sock_prot_inuse_add(net, sk->sk_prot, -1); local_bh_enable(); + xsk_delete_from_maps(xs); + mutex_lock(&xs->mutex); xsk_unbind_dev(xs); + mutex_unlock(&xs->mutex); xskq_destroy(xs->rx); xskq_destroy(xs->tx); @@ -412,6 +626,24 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd) return sock; } +/* Check if umem pages are contiguous. + * If zero-copy mode, use the DMA address to do the page contiguity check + * For all other modes we use addr (kernel virtual address) + * Store the result in the low bits of addr. + */ +static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags) +{ + struct xdp_umem_page *pgs = umem->pages; + int i, is_contig; + + for (i = 0; i < umem->npgs - 1; i++) { + is_contig = (flags & XDP_ZEROCOPY) ? + (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) : + (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr); + pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT; + } +} + static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; @@ -427,7 +659,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) return -EINVAL; flags = sxdp->sxdp_flags; - if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY)) + if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | + XDP_USE_NEED_WAKEUP)) return -EINVAL; rtnl_lock(); @@ -454,7 +687,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) struct xdp_sock *umem_xs; struct socket *sock; - if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) { + if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || + (flags & XDP_USE_NEED_WAKEUP)) { /* Cannot specify flags for shared sockets. */ err = -EINVAL; goto out_unlock; @@ -473,19 +707,19 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } umem_xs = xdp_sk(sock->sk); - if (!umem_xs->umem) { - /* No umem to inherit. */ + if (!xsk_is_bound(umem_xs)) { err = -EBADF; sockfd_put(sock); goto out_unlock; - } else if (umem_xs->dev != dev || umem_xs->queue_id != qid) { + } + if (umem_xs->dev != dev || umem_xs->queue_id != qid) { err = -EINVAL; sockfd_put(sock); goto out_unlock; } xdp_get_umem(umem_xs->umem); - xs->umem = umem_xs->umem; + WRITE_ONCE(xs->umem, umem_xs->umem); sockfd_put(sock); } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { err = -EINVAL; @@ -500,6 +734,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); if (err) goto out_unlock; + + xsk_check_page_contiguity(xs->umem, flags); } xs->dev = dev; @@ -510,16 +746,28 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xdp_add_sk_umem(xs->umem, xs); out_unlock: - if (err) + if (err) { dev_put(dev); - else - xs->state = XSK_BOUND; + } else { + /* Matches smp_rmb() in bind() for shared umem + * sockets, and xsk_is_bound(). + */ + smp_wmb(); + WRITE_ONCE(xs->state, XSK_BOUND); + } out_release: mutex_unlock(&xs->mutex); rtnl_unlock(); return err; } +struct xdp_umem_reg_v1 { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 chunk_size; + __u32 headroom; +}; + static int xsk_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -549,15 +797,24 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, } q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; err = xsk_init_queue(entries, q, false); + if (!err && optname == XDP_TX_RING) + /* Tx needs to be explicitly woken up the first time */ + xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; mutex_unlock(&xs->mutex); return err; } case XDP_UMEM_REG: { - struct xdp_umem_reg mr; + size_t mr_size = sizeof(struct xdp_umem_reg); + struct xdp_umem_reg mr = {}; struct xdp_umem *umem; - if (copy_from_user(&mr, optval, sizeof(mr))) + if (optlen < sizeof(struct xdp_umem_reg_v1)) + return -EINVAL; + else if (optlen < sizeof(mr)) + mr_size = sizeof(struct xdp_umem_reg_v1); + + if (copy_from_user(&mr, optval, mr_size)) return -EFAULT; mutex_lock(&xs->mutex); @@ -574,7 +831,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, /* Make sure umem is ready before it can be seen by others */ smp_wmb(); - xs->umem = umem; + WRITE_ONCE(xs->umem, umem); mutex_unlock(&xs->mutex); return 0; } @@ -610,6 +867,20 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -ENOPROTOOPT; } +static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring) +{ + ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); + ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); + ring->desc = offsetof(struct xdp_rxtx_ring, desc); +} + +static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) +{ + ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer); + ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); + ring->desc = offsetof(struct xdp_umem_ring, desc); +} + static int xsk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { @@ -649,26 +920,49 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, case XDP_MMAP_OFFSETS: { struct xdp_mmap_offsets off; + struct xdp_mmap_offsets_v1 off_v1; + bool flags_supported = true; + void *to_copy; - if (len < sizeof(off)) + if (len < sizeof(off_v1)) return -EINVAL; + else if (len < sizeof(off)) + flags_supported = false; + + if (flags_supported) { + /* xdp_ring_offset is identical to xdp_ring_offset_v1 + * except for the flags field added to the end. + */ + xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) + &off.rx); + xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) + &off.tx); + xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) + &off.fr); + xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) + &off.cr); + off.rx.flags = offsetof(struct xdp_rxtx_ring, + ptrs.flags); + off.tx.flags = offsetof(struct xdp_rxtx_ring, + ptrs.flags); + off.fr.flags = offsetof(struct xdp_umem_ring, + ptrs.flags); + off.cr.flags = offsetof(struct xdp_umem_ring, + ptrs.flags); + + len = sizeof(off); + to_copy = &off; + } else { + xsk_enter_rxtx_offsets(&off_v1.rx); + xsk_enter_rxtx_offsets(&off_v1.tx); + xsk_enter_umem_offsets(&off_v1.fr); + xsk_enter_umem_offsets(&off_v1.cr); + + len = sizeof(off_v1); + to_copy = &off_v1; + } - off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); - off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); - off.rx.desc = offsetof(struct xdp_rxtx_ring, desc); - off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); - off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); - off.tx.desc = offsetof(struct xdp_rxtx_ring, desc); - - off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer); - off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); - off.fr.desc = offsetof(struct xdp_umem_ring, desc); - off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer); - off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); - off.cr.desc = offsetof(struct xdp_umem_ring, desc); - - len = sizeof(off); - if (copy_to_user(optval, &off, len)) + if (copy_to_user(optval, to_copy, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; @@ -713,7 +1007,7 @@ static int xsk_mmap(struct file *file, struct socket *sock, unsigned long pfn; struct page *qpg; - if (xs->state != XSK_READY) + if (READ_ONCE(xs->state) != XSK_READY) return -EBUSY; if (offset == XDP_PGOFF_RX_RING) { @@ -739,7 +1033,7 @@ static int xsk_mmap(struct file *file, struct socket *sock, /* Matches the smp_wmb() in xsk_init_queue */ smp_rmb(); qpg = virt_to_head_page(q->ring); - if (size > (PAGE_SIZE << compound_order(qpg))) + if (size > page_size(qpg)) return -EINVAL; pfn = virt_to_phys(q->ring) >> PAGE_SHIFT; @@ -855,6 +1149,9 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, spin_lock_init(&xs->rx_lock); spin_lock_init(&xs->tx_completion_lock); + INIT_LIST_HEAD(&xs->map_list); + spin_lock_init(&xs->map_list_lock); + mutex_lock(&net->xdp.lock); sk_add_node_rcu(sk, &net->xdp.list); mutex_unlock(&net->xdp.lock); @@ -895,7 +1192,7 @@ static struct pernet_operations xsk_net_ops = { static int __init xsk_init(void) { - int err; + int err, cpu; err = proto_register(&xsk_proto, 0 /* no slab */); if (err) @@ -913,6 +1210,8 @@ static int __init xsk_init(void) if (err) goto out_pernet; + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu)); return 0; out_pernet: diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index ba8120610426..4cfd106bdb53 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -4,6 +4,19 @@ #ifndef XSK_H_ #define XSK_H_ +struct xdp_ring_offset_v1 { + __u64 producer; + __u64 consumer; + __u64 desc; +}; + +struct xdp_mmap_offsets_v1 { + struct xdp_ring_offset_v1 rx; + struct xdp_ring_offset_v1 tx; + struct xdp_ring_offset_v1 fr; + struct xdp_ring_offset_v1 cr; +}; + static inline struct xdp_sock *xdp_sk(struct sock *sk) { return (struct xdp_sock *)sk; diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index d5e06c8e0cbf..f59791ba43a0 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) du.id = umem->id; du.size = umem->size; du.num_pages = umem->npgs; - du.chunk_size = (__u32)(~umem->chunk_mask + 1); + du.chunk_size = umem->chunk_size_nohr + umem->headroom; du.headroom = umem->headroom; du.ifindex = umem->dev ? umem->dev->ifindex : 0; du.queue_id = umem->queue_id; @@ -97,6 +97,7 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, msg->xdiag_ino = sk_ino; sock_diag_save_cookie(sk, msg->xdiag_cookie); + mutex_lock(&xs->mutex); if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb)) goto out_nlmsg_trim; @@ -117,10 +118,12 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO)) goto out_nlmsg_trim; + mutex_unlock(&xs->mutex); nlmsg_end(nlskb, nlh); return 0; out_nlmsg_trim: + mutex_unlock(&xs->mutex); nlmsg_cancel(nlskb, nlh); return -EMSGSIZE; } diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index b66504592d9b..c90e9c1e3c63 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -18,14 +18,14 @@ void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask) q->chunk_mask = chunk_mask; } -static u32 xskq_umem_get_ring_size(struct xsk_queue *q) +static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue) { - return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u64); -} + struct xdp_umem_ring *umem_ring; + struct xdp_rxtx_ring *rxtx_ring; -static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q) -{ - return sizeof(struct xdp_ring) + q->nentries * sizeof(struct xdp_desc); + if (umem_queue) + return struct_size(umem_ring, desc, q->nentries); + return struct_size(rxtx_ring, desc, q->nentries); } struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) @@ -43,8 +43,7 @@ struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | __GFP_NORETRY; - size = umem_queue ? xskq_umem_get_ring_size(q) : - xskq_rxtx_get_ring_size(q); + size = xskq_get_ring_size(q, umem_queue); q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags, get_order(size)); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 909c5168ed0f..bec2af11853a 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -10,12 +10,10 @@ #include <linux/if_xdp.h> #include <net/xdp_sock.h> -#define RX_BATCH_SIZE 16 -#define LAZY_UPDATE_THRESHOLD 128 - struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; + u32 flags; }; /* Used for the RX and TX queues for packets */ @@ -35,10 +33,8 @@ struct xsk_queue { u64 size; u32 ring_mask; u32 nentries; - u32 prod_head; - u32 prod_tail; - u32 cons_head; - u32 cons_tail; + u32 cached_prod; + u32 cached_cons; struct xdp_ring *ring; u64 invalid_descs; }; @@ -85,57 +81,116 @@ struct xsk_queue { * now and again after circling through the ring. */ -/* Common functions operating for both RXTX and umem queues */ +/* The operations on the rings are the following: + * + * producer consumer + * + * RESERVE entries PEEK in the ring for entries + * WRITE data into the ring READ data from the ring + * SUBMIT entries RELEASE entries + * + * The producer reserves one or more entries in the ring. It can then + * fill in these entries and finally submit them so that they can be + * seen and read by the consumer. + * + * The consumer peeks into the ring to see if the producer has written + * any new entries. If so, the producer can then read these entries + * and when it is done reading them release them back to the producer + * so that the producer can use these slots to fill in new entries. + * + * The function names below reflect these operations. + */ + +/* Functions that read and validate content from consumer rings. */ -static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) +static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem, + u64 addr, + u64 length) { - return q ? q->invalid_descs : 0; + bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE; + bool next_pg_contig = + (unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr & + XSK_NEXT_PG_CONTIG_MASK; + + return cross_pg && !next_pg_contig; } -static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) +static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, + u64 addr, + u64 length, + struct xdp_umem *umem) { - u32 entries = q->prod_tail - q->cons_tail; + u64 base_addr = xsk_umem_extract_addr(addr); - if (entries == 0) { - /* Refresh the local pointer */ - q->prod_tail = READ_ONCE(q->ring->producer); - entries = q->prod_tail - q->cons_tail; + addr = xsk_umem_add_offset_to_addr(addr); + if (base_addr >= q->size || addr >= q->size || + xskq_cons_crosses_non_contig_pg(umem, addr, length)) { + q->invalid_descs++; + return false; } - return (entries > dcnt) ? dcnt : entries; + return true; } -static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) +static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) { - u32 free_entries = q->nentries - (producer - q->cons_tail); - - if (free_entries >= dcnt) - return free_entries; + if (addr >= q->size) { + q->invalid_descs++; + return false; + } - /* Refresh the local tail pointer */ - q->cons_tail = READ_ONCE(q->ring->consumer); - return q->nentries - (producer - q->cons_tail); + return true; } -static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt) +static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr, + struct xdp_umem *umem) { - u32 entries = q->prod_tail - q->cons_tail; + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - if (entries >= cnt) - return true; + while (q->cached_cons != q->cached_prod) { + u32 idx = q->cached_cons & q->ring_mask; - /* Refresh the local pointer. */ - q->prod_tail = READ_ONCE(q->ring->producer); - entries = q->prod_tail - q->cons_tail; + *addr = ring->desc[idx] & q->chunk_mask; - return entries >= cnt; -} + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { + if (xskq_cons_is_valid_unaligned(q, *addr, + umem->chunk_size_nohr, + umem)) + return true; + goto out; + } -/* UMEM queue */ + if (xskq_cons_is_valid_addr(q, *addr)) + return true; -static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) +out: + q->cached_cons++; + } + + return false; +} + +static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, + struct xdp_desc *d, + struct xdp_umem *umem) { - if (addr >= q->size) { + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { + if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem)) + return false; + + if (d->len > umem->chunk_size_nohr || d->options) { + q->invalid_descs++; + return false; + } + + return true; + } + + if (!xskq_cons_is_valid_addr(q, d->addr)) + return false; + + if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) || + d->options) { q->invalid_descs++; return false; } @@ -143,177 +198,184 @@ static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) return true; } -static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr) +static inline bool xskq_cons_read_desc(struct xsk_queue *q, + struct xdp_desc *desc, + struct xdp_umem *umem) { - while (q->cons_tail != q->cons_head) { - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - unsigned int idx = q->cons_tail & q->ring_mask; + while (q->cached_cons != q->cached_prod) { + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + u32 idx = q->cached_cons & q->ring_mask; - *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask; - if (xskq_is_valid_addr(q, *addr)) - return addr; + *desc = ring->desc[idx]; + if (xskq_cons_is_valid_desc(q, desc, umem)) + return true; - q->cons_tail++; + q->cached_cons++; } - return NULL; + return false; } -static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr) -{ - if (q->cons_tail == q->cons_head) { - smp_mb(); /* D, matches A */ - WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); - - /* Order consumer and data */ - smp_rmb(); - } +/* Functions for consumers */ - return xskq_validate_addr(q, addr); +static inline void __xskq_cons_release(struct xsk_queue *q) +{ + smp_mb(); /* D, matches A */ + WRITE_ONCE(q->ring->consumer, q->cached_cons); } -static inline void xskq_discard_addr(struct xsk_queue *q) +static inline void __xskq_cons_peek(struct xsk_queue *q) { - q->cons_tail++; + /* Refresh the local pointer */ + q->cached_prod = READ_ONCE(q->ring->producer); + smp_rmb(); /* C, matches B */ } -static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) +static inline void xskq_cons_get_entries(struct xsk_queue *q) { - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + __xskq_cons_release(q); + __xskq_cons_peek(q); +} - if (xskq_nb_free(q, q->prod_tail, 1) == 0) - return -ENOSPC; +static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) +{ + u32 entries = q->cached_prod - q->cached_cons; - /* A, matches D */ - ring->desc[q->prod_tail++ & q->ring_mask] = addr; + if (entries >= cnt) + return true; - /* Order producer and data */ - smp_wmb(); /* B, matches C */ + __xskq_cons_peek(q); + entries = q->cached_prod - q->cached_cons; - WRITE_ONCE(q->ring->producer, q->prod_tail); - return 0; + return entries >= cnt; } -static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr) +static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, + struct xdp_umem *umem) { - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0) - return -ENOSPC; - - /* A, matches D */ - ring->desc[q->prod_head++ & q->ring_mask] = addr; - return 0; + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return xskq_cons_read_addr(q, addr, umem); } -static inline void xskq_produce_flush_addr_n(struct xsk_queue *q, - u32 nb_entries) +static inline bool xskq_cons_peek_desc(struct xsk_queue *q, + struct xdp_desc *desc, + struct xdp_umem *umem) { - /* Order producer and data */ - smp_wmb(); /* B, matches C */ - - q->prod_tail += nb_entries; - WRITE_ONCE(q->ring->producer, q->prod_tail); + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return xskq_cons_read_desc(q, desc, umem); } -static inline int xskq_reserve_addr(struct xsk_queue *q) +static inline void xskq_cons_release(struct xsk_queue *q) { - if (xskq_nb_free(q, q->prod_head, 1) == 0) - return -ENOSPC; + /* To improve performance, only update local state here. + * Reflect this to global state when we get new entries + * from the ring in xskq_cons_get_entries(). + */ + q->cached_cons++; +} - /* A, matches D */ - q->prod_head++; - return 0; +static inline bool xskq_cons_is_full(struct xsk_queue *q) +{ + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer) == + q->nentries; } -/* Rx/Tx queue */ +/* Functions for producers */ -static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d) +static inline bool xskq_prod_is_full(struct xsk_queue *q) { - if (!xskq_is_valid_addr(q, d->addr)) - return false; + u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); - if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) || - d->options) { - q->invalid_descs++; + if (free_entries) return false; - } - return true; + /* Refresh the local tail pointer */ + q->cached_cons = READ_ONCE(q->ring->consumer); + free_entries = q->nentries - (q->cached_prod - q->cached_cons); + + return !free_entries; } -static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, - struct xdp_desc *desc) +static inline int xskq_prod_reserve(struct xsk_queue *q) { - while (q->cons_tail != q->cons_head) { - struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; - unsigned int idx = q->cons_tail & q->ring_mask; - - *desc = READ_ONCE(ring->desc[idx]); - if (xskq_is_valid_desc(q, desc)) - return desc; - - q->cons_tail++; - } + if (xskq_prod_is_full(q)) + return -ENOSPC; - return NULL; + /* A, matches D */ + q->cached_prod++; + return 0; } -static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, - struct xdp_desc *desc) +static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) { - if (q->cons_tail == q->cons_head) { - smp_mb(); /* D, matches A */ - WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); - - /* Order consumer and data */ - smp_rmb(); /* C, matches B */ - } + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - return xskq_validate_desc(q, desc); -} + if (xskq_prod_is_full(q)) + return -ENOSPC; -static inline void xskq_discard_desc(struct xsk_queue *q) -{ - q->cons_tail++; + /* A, matches D */ + ring->desc[q->cached_prod++ & q->ring_mask] = addr; + return 0; } -static inline int xskq_produce_batch_desc(struct xsk_queue *q, - u64 addr, u32 len) +static inline int xskq_prod_reserve_desc(struct xsk_queue *q, + u64 addr, u32 len) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; - unsigned int idx; + u32 idx; - if (xskq_nb_free(q, q->prod_head, 1) == 0) + if (xskq_prod_is_full(q)) return -ENOSPC; /* A, matches D */ - idx = (q->prod_head++) & q->ring_mask; + idx = q->cached_prod++ & q->ring_mask; ring->desc[idx].addr = addr; ring->desc[idx].len = len; return 0; } -static inline void xskq_produce_flush_desc(struct xsk_queue *q) +static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx) { - /* Order producer and data */ smp_wmb(); /* B, matches C */ - q->prod_tail = q->prod_head; - WRITE_ONCE(q->ring->producer, q->prod_tail); + WRITE_ONCE(q->ring->producer, idx); +} + +static inline void xskq_prod_submit(struct xsk_queue *q) +{ + __xskq_prod_submit(q, q->cached_prod); +} + +static inline void xskq_prod_submit_addr(struct xsk_queue *q, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + u32 idx = q->ring->producer; + + ring->desc[idx++ & q->ring_mask] = addr; + + __xskq_prod_submit(q, idx); } -static inline bool xskq_full_desc(struct xsk_queue *q) +static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) { - return xskq_nb_avail(q, q->nentries) == q->nentries; + __xskq_prod_submit(q, q->ring->producer + nb_entries); } -static inline bool xskq_empty_desc(struct xsk_queue *q) +static inline bool xskq_prod_is_empty(struct xsk_queue *q) { - return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries; + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer); +} + +/* For both producers and consumers */ + +static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) +{ + return q ? q->invalid_descs : 0; } void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask); diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 51bb6018f3bf..6921a18201a0 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -3,20 +3,20 @@ # XFRM configuration # config XFRM - bool - depends on INET - select GRO_CELLS - select SKB_EXTENSIONS + bool + depends on INET + select GRO_CELLS + select SKB_EXTENSIONS config XFRM_OFFLOAD - bool + bool config XFRM_ALGO tristate select XFRM select CRYPTO select CRYPTO_HASH - select CRYPTO_BLKCIPHER + select CRYPTO_SKCIPHER if INET config XFRM_USER diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index fbc4552d17b8..212a4fcb4a88 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -11,3 +11,4 @@ obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o +obj-$(CONFIG_INET_ESPINTCP) += espintcp.o diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c new file mode 100644 index 000000000000..f15d6a564b0e --- /dev/null +++ b/net/xfrm/espintcp.c @@ -0,0 +1,509 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <net/tcp.h> +#include <net/strparser.h> +#include <net/xfrm.h> +#include <net/esp.h> +#include <net/espintcp.h> +#include <linux/skmsg.h> +#include <net/inet_common.h> + +static void handle_nonesp(struct espintcp_ctx *ctx, struct sk_buff *skb, + struct sock *sk) +{ + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf || + !sk_rmem_schedule(sk, skb, skb->truesize)) { + kfree_skb(skb); + return; + } + + skb_set_owner_r(skb, sk); + + memset(skb->cb, 0, sizeof(skb->cb)); + skb_queue_tail(&ctx->ike_queue, skb); + ctx->saved_data_ready(sk); +} + +static void handle_esp(struct sk_buff *skb, struct sock *sk) +{ + skb_reset_transport_header(skb); + memset(skb->cb, 0, sizeof(skb->cb)); + + rcu_read_lock(); + skb->dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); + local_bh_disable(); + xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, TCP_ENCAP_ESPINTCP); + local_bh_enable(); + rcu_read_unlock(); +} + +static void espintcp_rcv(struct strparser *strp, struct sk_buff *skb) +{ + struct espintcp_ctx *ctx = container_of(strp, struct espintcp_ctx, + strp); + struct strp_msg *rxm = strp_msg(skb); + u32 nonesp_marker; + int err; + + err = skb_copy_bits(skb, rxm->offset + 2, &nonesp_marker, + sizeof(nonesp_marker)); + if (err < 0) { + kfree_skb(skb); + return; + } + + /* remove header, leave non-ESP marker/SPI */ + if (!__pskb_pull(skb, rxm->offset + 2)) { + kfree_skb(skb); + return; + } + + if (pskb_trim(skb, rxm->full_len - 2) != 0) { + kfree_skb(skb); + return; + } + + if (nonesp_marker == 0) + handle_nonesp(ctx, skb, strp->sk); + else + handle_esp(skb, strp->sk); +} + +static int espintcp_parse(struct strparser *strp, struct sk_buff *skb) +{ + struct strp_msg *rxm = strp_msg(skb); + __be16 blen; + u16 len; + int err; + + if (skb->len < rxm->offset + 2) + return 0; + + err = skb_copy_bits(skb, rxm->offset, &blen, sizeof(blen)); + if (err < 0) + return err; + + len = be16_to_cpu(blen); + if (len < 6) + return -EINVAL; + + return len; +} + +static int espintcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct sk_buff *skb; + int err = 0; + int copied; + int off = 0; + + flags |= nonblock ? MSG_DONTWAIT : 0; + + skb = __skb_recv_datagram(sk, &ctx->ike_queue, flags, NULL, &off, &err); + if (!skb) + return err; + + copied = len; + if (copied > skb->len) + copied = skb->len; + else if (copied < skb->len) + msg->msg_flags |= MSG_TRUNC; + + err = skb_copy_datagram_msg(skb, 0, msg, copied); + if (unlikely(err)) { + kfree_skb(skb); + return err; + } + + if (flags & MSG_TRUNC) + copied = skb->len; + kfree_skb(skb); + return copied; +} + +int espintcp_queue_out(struct sock *sk, struct sk_buff *skb) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + + if (skb_queue_len(&ctx->out_queue) >= netdev_max_backlog) + return -ENOBUFS; + + __skb_queue_tail(&ctx->out_queue, skb); + + return 0; +} +EXPORT_SYMBOL_GPL(espintcp_queue_out); + +/* espintcp length field is 2B and length includes the length field's size */ +#define MAX_ESPINTCP_MSG (((1 << 16) - 1) - 2) + +static int espintcp_sendskb_locked(struct sock *sk, struct espintcp_msg *emsg, + int flags) +{ + do { + int ret; + + ret = skb_send_sock_locked(sk, emsg->skb, + emsg->offset, emsg->len); + if (ret < 0) + return ret; + + emsg->len -= ret; + emsg->offset += ret; + } while (emsg->len > 0); + + kfree_skb(emsg->skb); + memset(emsg, 0, sizeof(*emsg)); + + return 0; +} + +static int espintcp_sendskmsg_locked(struct sock *sk, + struct espintcp_msg *emsg, int flags) +{ + struct sk_msg *skmsg = &emsg->skmsg; + struct scatterlist *sg; + int done = 0; + int ret; + + flags |= MSG_SENDPAGE_NOTLAST; + sg = &skmsg->sg.data[skmsg->sg.start]; + do { + size_t size = sg->length - emsg->offset; + int offset = sg->offset + emsg->offset; + struct page *p; + + emsg->offset = 0; + + if (sg_is_last(sg)) + flags &= ~MSG_SENDPAGE_NOTLAST; + + p = sg_page(sg); +retry: + ret = do_tcp_sendpages(sk, p, offset, size, flags); + if (ret < 0) { + emsg->offset = offset - sg->offset; + skmsg->sg.start += done; + return ret; + } + + if (ret != size) { + offset += ret; + size -= ret; + goto retry; + } + + done++; + put_page(p); + sk_mem_uncharge(sk, sg->length); + sg = sg_next(sg); + } while (sg); + + memset(emsg, 0, sizeof(*emsg)); + + return 0; +} + +static int espintcp_push_msgs(struct sock *sk) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct espintcp_msg *emsg = &ctx->partial; + int err; + + if (!emsg->len) + return 0; + + if (ctx->tx_running) + return -EAGAIN; + ctx->tx_running = 1; + + if (emsg->skb) + err = espintcp_sendskb_locked(sk, emsg, 0); + else + err = espintcp_sendskmsg_locked(sk, emsg, 0); + if (err == -EAGAIN) { + ctx->tx_running = 0; + return 0; + } + if (!err) + memset(emsg, 0, sizeof(*emsg)); + + ctx->tx_running = 0; + + return err; +} + +int espintcp_push_skb(struct sock *sk, struct sk_buff *skb) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct espintcp_msg *emsg = &ctx->partial; + unsigned int len; + int offset; + + if (sk->sk_state != TCP_ESTABLISHED) { + kfree_skb(skb); + return -ECONNRESET; + } + + offset = skb_transport_offset(skb); + len = skb->len - offset; + + espintcp_push_msgs(sk); + + if (emsg->len) { + kfree_skb(skb); + return -ENOBUFS; + } + + skb_set_owner_w(skb, sk); + + emsg->offset = offset; + emsg->len = len; + emsg->skb = skb; + + espintcp_push_msgs(sk); + + return 0; +} +EXPORT_SYMBOL_GPL(espintcp_push_skb); + +static int espintcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct espintcp_msg *emsg = &ctx->partial; + struct iov_iter pfx_iter; + struct kvec pfx_iov = {}; + size_t msglen = size + 2; + char buf[2] = {0}; + int err, end; + + if (msg->msg_flags) + return -EOPNOTSUPP; + + if (size > MAX_ESPINTCP_MSG) + return -EMSGSIZE; + + if (msg->msg_controllen) + return -EOPNOTSUPP; + + lock_sock(sk); + + err = espintcp_push_msgs(sk); + if (err < 0) { + err = -ENOBUFS; + goto unlock; + } + + sk_msg_init(&emsg->skmsg); + while (1) { + /* only -ENOMEM is possible since we don't coalesce */ + err = sk_msg_alloc(sk, &emsg->skmsg, msglen, 0); + if (!err) + break; + + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto fail; + } + + *((__be16 *)buf) = cpu_to_be16(msglen); + pfx_iov.iov_base = buf; + pfx_iov.iov_len = sizeof(buf); + iov_iter_kvec(&pfx_iter, WRITE, &pfx_iov, 1, pfx_iov.iov_len); + + err = sk_msg_memcopy_from_iter(sk, &pfx_iter, &emsg->skmsg, + pfx_iov.iov_len); + if (err < 0) + goto fail; + + err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, &emsg->skmsg, size); + if (err < 0) + goto fail; + + end = emsg->skmsg.sg.end; + emsg->len = size; + sk_msg_iter_var_prev(end); + sg_mark_end(sk_msg_elem(&emsg->skmsg, end)); + + tcp_rate_check_app_limited(sk); + + err = espintcp_push_msgs(sk); + /* this message could be partially sent, keep it */ + if (err < 0) + goto unlock; + release_sock(sk); + + return size; + +fail: + sk_msg_free(sk, &emsg->skmsg); + memset(emsg, 0, sizeof(*emsg)); +unlock: + release_sock(sk); + return err; +} + +static struct proto espintcp_prot __ro_after_init; +static struct proto_ops espintcp_ops __ro_after_init; + +static void espintcp_data_ready(struct sock *sk) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + + strp_data_ready(&ctx->strp); +} + +static void espintcp_tx_work(struct work_struct *work) +{ + struct espintcp_ctx *ctx = container_of(work, + struct espintcp_ctx, work); + struct sock *sk = ctx->strp.sk; + + lock_sock(sk); + if (!ctx->tx_running) + espintcp_push_msgs(sk); + release_sock(sk); +} + +static void espintcp_write_space(struct sock *sk) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + + schedule_work(&ctx->work); + ctx->saved_write_space(sk); +} + +static void espintcp_destruct(struct sock *sk) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + + kfree(ctx); +} + +bool tcp_is_ulp_esp(struct sock *sk) +{ + return sk->sk_prot == &espintcp_prot; +} +EXPORT_SYMBOL_GPL(tcp_is_ulp_esp); + +static int espintcp_init_sk(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct strp_callbacks cb = { + .rcv_msg = espintcp_rcv, + .parse_msg = espintcp_parse, + }; + struct espintcp_ctx *ctx; + int err; + + /* sockmap is not compatible with espintcp */ + if (sk->sk_user_data) + return -EBUSY; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + err = strp_init(&ctx->strp, sk, &cb); + if (err) + goto free; + + __sk_dst_reset(sk); + + strp_check_rcv(&ctx->strp); + skb_queue_head_init(&ctx->ike_queue); + skb_queue_head_init(&ctx->out_queue); + sk->sk_prot = &espintcp_prot; + sk->sk_socket->ops = &espintcp_ops; + ctx->saved_data_ready = sk->sk_data_ready; + ctx->saved_write_space = sk->sk_write_space; + sk->sk_data_ready = espintcp_data_ready; + sk->sk_write_space = espintcp_write_space; + sk->sk_destruct = espintcp_destruct; + rcu_assign_pointer(icsk->icsk_ulp_data, ctx); + INIT_WORK(&ctx->work, espintcp_tx_work); + + /* avoid using task_frag */ + sk->sk_allocation = GFP_ATOMIC; + + return 0; + +free: + kfree(ctx); + return err; +} + +static void espintcp_release(struct sock *sk) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct sk_buff_head queue; + struct sk_buff *skb; + + __skb_queue_head_init(&queue); + skb_queue_splice_init(&ctx->out_queue, &queue); + + while ((skb = __skb_dequeue(&queue))) + espintcp_push_skb(sk, skb); + + tcp_release_cb(sk); +} + +static void espintcp_close(struct sock *sk, long timeout) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct espintcp_msg *emsg = &ctx->partial; + + strp_stop(&ctx->strp); + + sk->sk_prot = &tcp_prot; + barrier(); + + cancel_work_sync(&ctx->work); + strp_done(&ctx->strp); + + skb_queue_purge(&ctx->out_queue); + skb_queue_purge(&ctx->ike_queue); + + if (emsg->len) { + if (emsg->skb) + kfree_skb(emsg->skb); + else + sk_msg_free(sk, &emsg->skmsg); + } + + tcp_close(sk, timeout); +} + +static __poll_t espintcp_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + __poll_t mask = datagram_poll(file, sock, wait); + struct sock *sk = sock->sk; + struct espintcp_ctx *ctx = espintcp_getctx(sk); + + if (!skb_queue_empty(&ctx->ike_queue)) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static struct tcp_ulp_ops espintcp_ulp __read_mostly = { + .name = "espintcp", + .owner = THIS_MODULE, + .init = espintcp_init_sk, +}; + +void __init espintcp_init(void) +{ + memcpy(&espintcp_prot, &tcp_prot, sizeof(tcp_prot)); + memcpy(&espintcp_ops, &inet_stream_ops, sizeof(inet_stream_ops)); + espintcp_prot.sendmsg = espintcp_sendmsg; + espintcp_prot.recvmsg = espintcp_recvmsg; + espintcp_prot.close = espintcp_close; + espintcp_prot.release_cb = espintcp_release; + espintcp_ops.poll = espintcp_poll; + + tcp_register_ulp(&espintcp_ulp); +} diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index 32a378e7011f..4dae3ab8d030 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -626,8 +626,8 @@ static const struct xfrm_algo_list xfrm_aalg_list = { static const struct xfrm_algo_list xfrm_ealg_list = { .algs = ealg_list, .entries = ARRAY_SIZE(ealg_list), - .type = CRYPTO_ALG_TYPE_BLKCIPHER, - .mask = CRYPTO_ALG_TYPE_BLKCIPHER_MASK, + .type = CRYPTO_ALG_TYPE_SKCIPHER, + .mask = CRYPTO_ALG_TYPE_MASK, }; static const struct xfrm_algo_list xfrm_calg_list = { diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 189ef15acbbc..50f567a88f45 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -78,7 +78,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur int err; unsigned long flags; struct xfrm_state *x; - struct sk_buff *skb2; + struct sk_buff *skb2, *nskb; struct softnet_data *sd; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); @@ -148,11 +148,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur return skb; } - skb2 = skb; - - do { - struct sk_buff *nskb = skb2->next; - + skb_list_walk_safe(skb, skb2, nskb) { esp_features |= skb->dev->gso_partial_features; skb_mark_not_on_list(skb2); @@ -176,14 +172,11 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur if (!skb) return NULL; - goto skip_push; + continue; } skb_push(skb2, skb2->data - skb_mac_header(skb2)); - -skip_push: - skb2 = nskb; - } while (skb2); + } return skb; } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 6088bc2dc11e..aa35f23c4912 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -36,6 +36,7 @@ struct xfrm_trans_cb { #endif } header; int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb); + struct net *net; }; #define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) @@ -480,6 +481,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) else XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID); + + if (encap_type == -1) + dev_put(skb->dev); goto drop; } @@ -706,7 +710,7 @@ resume: if (err) goto drop; - nf_reset(skb); + nf_reset_ct(skb); if (decaps) { sp = skb_sec_path(skb); @@ -763,12 +767,13 @@ static void xfrm_trans_reinject(unsigned long data) skb_queue_splice_init(&trans->queue, &queue); while ((skb = __skb_dequeue(&queue))) - XFRM_TRANS_SKB_CB(skb)->finish(dev_net(skb->dev), NULL, skb); + XFRM_TRANS_SKB_CB(skb)->finish(XFRM_TRANS_SKB_CB(skb)->net, + NULL, skb); } -int xfrm_trans_queue(struct sk_buff *skb, - int (*finish)(struct net *, struct sock *, - struct sk_buff *)) +int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, + int (*finish)(struct net *, struct sock *, + struct sk_buff *)) { struct xfrm_trans_tasklet *trans; @@ -777,11 +782,22 @@ int xfrm_trans_queue(struct sk_buff *skb, if (skb_queue_len(&trans->queue) >= netdev_max_backlog) return -ENOBUFS; + BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb)); + XFRM_TRANS_SKB_CB(skb)->finish = finish; + XFRM_TRANS_SKB_CB(skb)->net = net; __skb_queue_tail(&trans->queue, skb); tasklet_schedule(&trans->tasklet); return 0; } +EXPORT_SYMBOL(xfrm_trans_queue_net); + +int xfrm_trans_queue(struct sk_buff *skb, + int (*finish)(struct net *, struct sock *, + struct sk_buff *)) +{ + return xfrm_trans_queue_net(dev_net(skb->dev), skb, finish); +} EXPORT_SYMBOL(xfrm_trans_queue); void __init xfrm_input_init(void) diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 74868f9d81fb..3361e3ac5714 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -145,8 +145,6 @@ static int xfrmi_create(struct net_device *dev) if (err < 0) goto out; - strcpy(xi->p.name, dev->name); - dev_hold(dev); xfrmi_link(xfrmn, xi); @@ -177,7 +175,6 @@ static void xfrmi_dev_uninit(struct net_device *dev) struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); xfrmi_unlink(xfrmn, xi); - dev_put(xi->phydev); dev_put(dev); } @@ -188,7 +185,7 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); nf_reset_trace(skb); if (!xnet) @@ -271,9 +268,6 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) int err = -1; int mtu; - if (!dst) - goto tx_err_link_failure; - dst_hold(dst); dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, xi->p.if_id); if (IS_ERR(dst)) { @@ -294,22 +288,22 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if (tdev == dev) { stats->collisions++; net_warn_ratelimited("%s: Local routing loop detected!\n", - xi->p.name); + dev->name); goto tx_err_dst_release; } mtu = dst_mtu(dst); if (!skb->ignore_df && skb->len > mtu) { - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); } else { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); + icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); } dst_release(dst); @@ -346,6 +340,7 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net_device_stats *stats = &xi->dev->stats; + struct dst_entry *dst = skb_dst(skb); struct flowi fl; int ret; @@ -355,16 +350,39 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) case htons(ETH_P_IPV6): xfrm_decode_session(skb, &fl, AF_INET6); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + if (!dst) { + fl.u.ip6.flowi6_oif = dev->ifindex; + fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; + dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); + if (dst->error) { + dst_release(dst); + stats->tx_carrier_errors++; + goto tx_err; + } + skb_dst_set(skb, dst); + } break; case htons(ETH_P_IP): xfrm_decode_session(skb, &fl, AF_INET); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + if (!dst) { + struct rtable *rt; + + fl.u.ip4.flowi4_oif = dev->ifindex; + fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; + rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); + if (IS_ERR(rt)) { + stats->tx_carrier_errors++; + goto tx_err; + } + skb_dst_set(skb, &rt->dst); + } break; default: goto tx_err; } - fl.flowi_oif = xi->phydev->ifindex; + fl.flowi_oif = xi->p.link; ret = xfrmi_xmit2(skb, dev, &fl); if (ret < 0) @@ -505,7 +523,7 @@ static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p) static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) { - struct net *net = dev_net(xi->dev); + struct net *net = xi->net; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; @@ -550,7 +568,7 @@ static int xfrmi_get_iflink(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); - return xi->phydev->ifindex; + return xi->p.link; } @@ -566,22 +584,21 @@ static void xfrmi_dev_setup(struct net_device *dev) { dev->netdev_ops = &xfrmi_netdev_ops; dev->type = ARPHRD_NONE; - dev->hard_header_len = ETH_HLEN; - dev->min_header_len = ETH_HLEN; dev->mtu = ETH_DATA_LEN; dev->min_mtu = ETH_MIN_MTU; - dev->max_mtu = ETH_DATA_LEN; - dev->addr_len = ETH_ALEN; + dev->max_mtu = IP_MAX_MTU; dev->flags = IFF_NOARP; dev->needs_free_netdev = true; dev->priv_destructor = xfrmi_dev_free; netif_keep_dst(dev); + + eth_broadcast_addr(dev->broadcast); } static int xfrmi_dev_init(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); - struct net_device *phydev = xi->phydev; + struct net_device *phydev = __dev_get_by_index(xi->net, xi->p.link); int err; dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); @@ -596,13 +613,19 @@ static int xfrmi_dev_init(struct net_device *dev) dev->features |= NETIF_F_LLTX; - dev->needed_headroom = phydev->needed_headroom; - dev->needed_tailroom = phydev->needed_tailroom; + if (phydev) { + dev->needed_headroom = phydev->needed_headroom; + dev->needed_tailroom = phydev->needed_tailroom; - if (is_zero_ether_addr(dev->dev_addr)) - eth_hw_addr_inherit(dev, phydev); - if (is_zero_ether_addr(dev->broadcast)) - memcpy(dev->broadcast, phydev->broadcast, dev->addr_len); + if (is_zero_ether_addr(dev->dev_addr)) + eth_hw_addr_inherit(dev, phydev); + if (is_zero_ether_addr(dev->broadcast)) + memcpy(dev->broadcast, phydev->broadcast, + dev->addr_len); + } else { + eth_hw_addr_random(dev); + eth_broadcast_addr(dev->broadcast); + } return 0; } @@ -638,12 +661,6 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev, int err; xfrmi_netlink_parms(data, &p); - - if (!tb[IFLA_IFNAME]) - return -EINVAL; - - nla_strlcpy(p.name, tb[IFLA_IFNAME], IFNAMSIZ); - xi = xfrmi_locate(net, &p); if (xi) return -EEXIST; @@ -652,13 +669,8 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev, xi->p = p; xi->net = net; xi->dev = dev; - xi->phydev = dev_get_by_index(net, p.link); - if (!xi->phydev) - return -ENODEV; err = xfrmi_create(dev); - if (err < 0) - dev_put(xi->phydev); return err; } @@ -672,11 +684,11 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct xfrm_if *xi = netdev_priv(dev); - struct net *net = dev_net(dev); - - xfrmi_netlink_parms(data, &xi->p); + struct net *net = xi->net; + struct xfrm_if_parms p; - xi = xfrmi_locate(net, &xi->p); + xfrmi_netlink_parms(data, &p); + xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); } else { @@ -684,7 +696,7 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], return -EEXIST; } - return xfrmi_update(xi, &xi->p); + return xfrmi_update(xi, &p); } static size_t xfrmi_get_size(const struct net_device *dev) @@ -715,7 +727,7 @@ static struct net *xfrmi_get_link_net(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); - return dev_net(xi->phydev); + return xi->net; } static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { @@ -738,30 +750,7 @@ static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .get_link_net = xfrmi_get_link_net, }; -static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn) -{ - struct xfrm_if *xi; - LIST_HEAD(list); - - xi = rtnl_dereference(xfrmn->xfrmi[0]); - if (!xi) - return; - - unregister_netdevice_queue(xi->dev, &list); - unregister_netdevice_many(&list); -} - -static void __net_exit xfrmi_exit_net(struct net *net) -{ - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - - rtnl_lock(); - xfrmi_destroy_interfaces(xfrmn); - rtnl_unlock(); -} - static struct pernet_operations xfrmi_net_ops = { - .exit = xfrmi_exit_net, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index 32c364d3bfb3..4d422447aadc 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -85,7 +85,7 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) if (dlen < len) len = dlen; - frag->page_offset = 0; + skb_frag_off_set(frag, 0); skb_frag_size_set(frag, len); memcpy(skb_frag_address(frag), scratch, len); diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 9499b35feb92..fafc7aba705f 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -502,7 +502,7 @@ int xfrm_output_resume(struct sk_buff *skb, int err) struct net *net = xs_net(skb_dst(skb)->xfrm); while (likely((err = xfrm_output_one(skb, err)) == 0)) { - nf_reset(skb); + nf_reset_ct(skb); err = skb_dst(skb)->ops->local_out(net, skb->sk, skb); if (unlikely(err != 1)) @@ -533,7 +533,7 @@ static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb) static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct sk_buff *segs; + struct sk_buff *segs, *nskb; BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_SGO_CB_OFFSET); @@ -544,8 +544,7 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb if (segs == NULL) return -EINVAL; - do { - struct sk_buff *nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { int err; skb_mark_not_on_list(segs); @@ -555,9 +554,7 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb kfree_skb_list(nskb); return err; } - - segs = nskb; - } while (segs); + } return 0; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 8ca637a72697..dbda08ec566e 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -39,6 +39,9 @@ #ifdef CONFIG_XFRM_STATISTICS #include <net/snmp.h> #endif +#ifdef CONFIG_INET_ESPINTCP +#include <net/espintcp.h> +#endif #include "xfrm_hash.h" @@ -912,6 +915,7 @@ restart: } else if (delta > 0) { p = &parent->rb_right; } else { + bool same_prefixlen = node->prefixlen == n->prefixlen; struct xfrm_policy *tmp; hlist_for_each_entry(tmp, &n->hhead, bydst) { @@ -919,9 +923,11 @@ restart: hlist_del_rcu(&tmp->bydst); } + node->prefixlen = prefixlen; + xfrm_policy_inexact_list_reinsert(net, node, family); - if (node->prefixlen == n->prefixlen) { + if (same_prefixlen) { kfree_rcu(n, rcu); return; } @@ -929,7 +935,6 @@ restart: rb_erase(*p, new); kfree_rcu(n, rcu); n = node; - n->prefixlen = prefixlen; goto restart; } } @@ -2806,7 +2811,7 @@ static void xfrm_policy_queue_process(struct timer_list *t) continue; } - nf_reset(skb); + nf_reset_ct(skb); skb_dst_drop(skb); skb_dst_set(skb, dst); @@ -3184,7 +3189,7 @@ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, flags | XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_KEEP_DST_REF); - if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE) + if (PTR_ERR(dst) == -EREMOTE) return make_blackhole(net, dst_orig->ops->family, dst_orig); if (IS_ERR(dst)) @@ -3269,7 +3274,7 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) struct flowi4 *fl4 = &fl->u.ip4; int oif = 0; - if (skb_dst(skb)) + if (skb_dst(skb) && skb_dst(skb)->dev) oif = skb_dst(skb)->dev->ifindex; memset(fl4, 0, sizeof(struct flowi4)); @@ -3387,7 +3392,7 @@ decode_session6(struct sk_buff *skb, struct flowi *fl, bool reverse) nexthdr = nh[nhoff]; - if (skb_dst(skb)) + if (skb_dst(skb) && skb_dst(skb)->dev) oif = skb_dst(skb)->dev->ifindex; memset(fl6, 0, sizeof(struct flowi6)); @@ -4155,6 +4160,10 @@ void __init xfrm_init(void) seqcount_init(&xfrm_policy_hash_generation); xfrm_input_init(); +#ifdef CONFIG_INET_ESPINTCP + espintcp_init(); +#endif + RCU_INIT_POINTER(xfrm_if_cb, NULL); synchronize_rcu(); } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index c6f3c4a1bd99..170d6e7f31d3 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -495,6 +495,8 @@ static void ___xfrm_state_destroy(struct xfrm_state *x) x->type->destructor(x); xfrm_put_type(x->type); } + if (x->xfrag.page) + put_page(x->xfrag.page); xfrm_dev_state_free(x); security_xfrm_state_free(x); xfrm_state_free(x); @@ -668,6 +670,9 @@ int __xfrm_state_delete(struct xfrm_state *x) net->xfrm.state_num--; spin_unlock(&net->xfrm.xfrm_state_lock); + if (x->encap_sk) + sock_put(rcu_dereference_raw(x->encap_sk)); + xfrm_dev_state_delete(x); /* All xfrm_state objects are created by xfrm_state_alloc. |