diff options
Diffstat (limited to 'net')
216 files changed, 4573 insertions, 2373 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index a0103500cc6d..8dfdd94e430f 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -379,6 +379,9 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, dev->name); vlan_vid_add(dev, htons(ETH_P_8021Q), 0); } + if (event == NETDEV_DOWN && + (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) + vlan_vid_del(dev, htons(ETH_P_8021Q), 0); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) @@ -426,9 +429,6 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, struct net_device *tmp; LIST_HEAD(close_list); - if (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER) - vlan_vid_del(dev, htons(ETH_P_8021Q), 0); - /* Put all VLANs for this dev in the down state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 883d25778fa4..e882d8b5db05 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -121,7 +121,7 @@ static struct notifier_block mpoa_notifier = { struct mpoa_client *mpcs = NULL; /* FIXME */ static struct atm_mpoa_qos *qos_head = NULL; -static DEFINE_TIMER(mpc_timer, NULL, 0, 0); +static DEFINE_TIMER(mpc_timer, NULL); static struct mpoa_client *find_mpc_by_itfnum(int itf) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index a096d3e189da..7f98a7d25866 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -137,7 +137,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb mdst = br_mdb_get(br, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(br, eth_hdr(skb))) { - if ((mdst && mdst->mglist) || + if ((mdst && mdst->host_joined) || br_multicast_is_router(br)) { local_rcv = true; br->dev->stats.multicast++; diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 31ddff22563e..b0f4c734900b 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -292,6 +292,46 @@ err: kfree(priv); } +static void br_mdb_switchdev_host_port(struct net_device *dev, + struct net_device *lower_dev, + struct br_mdb_entry *entry, int type) +{ + struct switchdev_obj_port_mdb mdb = { + .obj = { + .id = SWITCHDEV_OBJ_ID_HOST_MDB, + .flags = SWITCHDEV_F_DEFER, + }, + .vid = entry->vid, + }; + + if (entry->addr.proto == htons(ETH_P_IP)) + ip_eth_mc_map(entry->addr.u.ip4, mdb.addr); +#if IS_ENABLED(CONFIG_IPV6) + else + ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr); +#endif + + mdb.obj.orig_dev = dev; + switch (type) { + case RTM_NEWMDB: + switchdev_port_obj_add(lower_dev, &mdb.obj); + break; + case RTM_DELMDB: + switchdev_port_obj_del(lower_dev, &mdb.obj); + break; + } +} + +static void br_mdb_switchdev_host(struct net_device *dev, + struct br_mdb_entry *entry, int type) +{ + struct net_device *lower_dev; + struct list_head *iter; + + netdev_for_each_lower_dev(dev, lower_dev, iter) + br_mdb_switchdev_host_port(dev, lower_dev, entry, type); +} + static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p, struct br_mdb_entry *entry, int type) { @@ -317,7 +357,7 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p, #endif mdb.obj.orig_dev = port_dev; - if (port_dev && type == RTM_NEWMDB) { + if (p && port_dev && type == RTM_NEWMDB) { complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC); if (complete_info) { complete_info->port = p; @@ -327,10 +367,13 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p, if (switchdev_port_obj_add(port_dev, &mdb.obj)) kfree(complete_info); } - } else if (port_dev && type == RTM_DELMDB) { + } else if (p && port_dev && type == RTM_DELMDB) { switchdev_port_obj_del(port_dev, &mdb.obj); } + if (!p) + br_mdb_switchdev_host(dev, entry, type); + skb = nlmsg_new(rtnl_mdb_nlmsg_size(), GFP_ATOMIC); if (!skb) goto errout; @@ -353,7 +396,10 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, struct br_mdb_entry entry; memset(&entry, 0, sizeof(entry)); - entry.ifindex = port->dev->ifindex; + if (port) + entry.ifindex = port->dev->ifindex; + else + entry.ifindex = dev->ifindex; entry.addr.proto = group->proto; entry.addr.u.ip4 = group->u.ip4; #if IS_ENABLED(CONFIG_IPV6) @@ -655,7 +701,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) call_rcu_bh(&p->rcu, br_multicast_free_pg); err = 0; - if (!mp->ports && !mp->mglist && + if (!mp->ports && !mp->host_joined && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); break; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 5f7f0e9d446c..cb4729539b82 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -249,7 +249,8 @@ static void br_multicast_group_expired(struct timer_list *t) if (!netif_running(br->dev) || timer_pending(&mp->timer)) goto out; - mp->mglist = false; + mp->host_joined = false; + br_mdb_notify(br->dev, NULL, &mp->addr, RTM_DELMDB, 0); if (mp->ports) goto out; @@ -292,7 +293,7 @@ static void br_multicast_del_pg(struct net_bridge *br, p->flags); call_rcu_bh(&p->rcu, br_multicast_free_pg); - if (!mp->ports && !mp->mglist && + if (!mp->ports && !mp->host_joined && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); @@ -773,7 +774,10 @@ static int br_multicast_add_group(struct net_bridge *br, goto err; if (!port) { - mp->mglist = true; + if (!mp->host_joined) { + mp->host_joined = true; + br_mdb_notify(br->dev, NULL, &mp->addr, RTM_NEWMDB, 0); + } mod_timer(&mp->timer, now + br->multicast_membership_interval); goto out; } @@ -1477,7 +1481,7 @@ static int br_ip4_multicast_query(struct net_bridge *br, max_delay *= br->multicast_last_member_count; - if (mp->mglist && + if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : try_to_del_timer_sync(&mp->timer) >= 0)) @@ -1561,7 +1565,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, goto out; max_delay *= br->multicast_last_member_count; - if (mp->mglist && + if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : try_to_del_timer_sync(&mp->timer) >= 0)) @@ -1622,7 +1626,7 @@ br_multicast_leave_group(struct net_bridge *br, br_mdb_notify(br->dev, port, group, RTM_DELMDB, p->flags); - if (!mp->ports && !mp->mglist && + if (!mp->ports && !mp->host_joined && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); } @@ -1662,7 +1666,7 @@ br_multicast_leave_group(struct net_bridge *br, br->multicast_last_member_interval; if (!port) { - if (mp->mglist && + if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, time) : try_to_del_timer_sync(&mp->timer) >= 0)) { diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 67bae0f11c67..d0ef0a8e8831 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -657,6 +657,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 }, [IFLA_BRPORT_MCAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_BCAST_FLOOD] = { .type = NLA_U8 }, + [IFLA_BRPORT_VLAN_TUNNEL] = { .type = NLA_U8 }, [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, }; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 40553d832b6e..1312b8d20ec3 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -209,7 +209,7 @@ struct net_bridge_mdb_entry struct rcu_head rcu; struct timer_list timer; struct br_ip addr; - bool mglist; + bool host_joined; }; struct net_bridge_mdb_htable diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 3b3dcf719e07..37817d25b63d 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -2112,9 +2112,8 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, for (i = 0, j = 1 ; j < 4 ; j++, i++) { struct compat_ebt_entry_mwt *match32; unsigned int size; - char *buf = buf_start; + char *buf = buf_start + offsets[i]; - buf = buf_start + offsets[i]; if (offsets[i] > offsets[j]) return -EINVAL; diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index ee43bc13221c..a3d0adc828e6 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -25,9 +25,9 @@ struct page **ceph_get_direct_page_vector(const void __user *data, return ERR_PTR(-ENOMEM); while (got < num_pages) { - rc = get_user_pages_unlocked( + rc = get_user_pages_fast( (unsigned long)data + ((unsigned long)got * PAGE_SIZE), - num_pages - got, pages + got, write_page ? FOLL_WRITE : 0); + num_pages - got, write_page, pages + got); if (rc < 0) break; BUG_ON(rc == 0); diff --git a/net/core/dev.c b/net/core/dev.c index 30b5fe32c525..8ee29f4f5fa9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1064,7 +1064,10 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) unsigned long *inuse; struct net_device *d; - p = strnchr(name, IFNAMSIZ-1, '%'); + if (!dev_valid_name(name)) + return -EINVAL; + + p = strchr(name, '%'); if (p) { /* * Verify the string as this thing may have come from @@ -1095,8 +1098,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) free_page((unsigned long) inuse); } - if (buf != name) - snprintf(buf, IFNAMSIZ, name, i); + snprintf(buf, IFNAMSIZ, name, i); if (!__dev_get_by_name(net, buf)) return i; @@ -1104,7 +1106,21 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) * when the name is long and there isn't enough space left * for the digits, or if all bits are used. */ - return -ENFILE; + return p ? -ENFILE : -EEXIST; +} + +static int dev_alloc_name_ns(struct net *net, + struct net_device *dev, + const char *name) +{ + char buf[IFNAMSIZ]; + int ret; + + BUG_ON(!net); + ret = __dev_alloc_name(net, name, buf); + if (ret >= 0) + strlcpy(dev->name, buf, IFNAMSIZ); + return ret; } /** @@ -1123,48 +1139,14 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) int dev_alloc_name(struct net_device *dev, const char *name) { - char buf[IFNAMSIZ]; - struct net *net; - int ret; - - BUG_ON(!dev_net(dev)); - net = dev_net(dev); - ret = __dev_alloc_name(net, name, buf); - if (ret >= 0) - strlcpy(dev->name, buf, IFNAMSIZ); - return ret; + return dev_alloc_name_ns(dev_net(dev), dev, name); } EXPORT_SYMBOL(dev_alloc_name); -static int dev_alloc_name_ns(struct net *net, - struct net_device *dev, - const char *name) -{ - char buf[IFNAMSIZ]; - int ret; - - ret = __dev_alloc_name(net, name, buf); - if (ret >= 0) - strlcpy(dev->name, buf, IFNAMSIZ); - return ret; -} - int dev_get_valid_name(struct net *net, struct net_device *dev, const char *name) { - BUG_ON(!net); - - if (!dev_valid_name(name)) - return -EINVAL; - - if (strchr(name, '%')) - return dev_alloc_name_ns(net, dev, name); - else if (__dev_get_by_name(net, name)) - return -EEXIST; - else if (dev->name != name) - strlcpy(dev->name, name, IFNAMSIZ); - - return 0; + return dev_alloc_name_ns(net, dev, name); } EXPORT_SYMBOL(dev_get_valid_name); @@ -3754,7 +3736,7 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; - cpu = ACCESS_ONCE(rflow->cpu); + cpu = READ_ONCE(rflow->cpu); if (rflow->filter == filter_id && cpu < nr_cpu_ids && ((int)(per_cpu(softnet_data, cpu).input_queue_head - rflow->last_qtail) < @@ -8667,6 +8649,8 @@ static void __net_exit netdev_exit(struct net *net) { kfree(net->dev_name_head); kfree(net->dev_index_head); + if (net != &init_net) + WARN_ON_ONCE(!list_empty(&net->dev_base_head)); } static struct pernet_operations __net_initdata netdev_net_ops = { diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 4fc202dbdfb6..0c048bdeb016 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -34,12 +34,14 @@ static unsigned int fib_seq_sum(void) rtnl_lock(); for_each_net(net) { - list_for_each_entry(ops, &net->fib_notifier_ops, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) { if (!try_module_get(ops->owner)) continue; fib_seq += ops->fib_seq_read(net); module_put(ops->owner); } + rcu_read_unlock(); } rtnl_unlock(); @@ -161,8 +163,14 @@ static int __net_init fib_notifier_net_init(struct net *net) return 0; } +static void __net_exit fib_notifier_net_exit(struct net *net) +{ + WARN_ON_ONCE(!list_empty(&net->fib_notifier_ops)); +} + static struct pernet_operations fib_notifier_net_ops = { .init = fib_notifier_net_init, + .exit = fib_notifier_net_exit, }; static int __init fib_notifier_init(void) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index fafd0a41e3f7..98e1066c3d55 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -1022,8 +1022,14 @@ static int __net_init fib_rules_net_init(struct net *net) return 0; } +static void __net_exit fib_rules_net_exit(struct net *net) +{ + WARN_ON_ONCE(!list_empty(&net->rules_ops)); +} + static struct pernet_operations fib_rules_net_ops = { .init = fib_rules_net_init, + .exit = fib_rules_net_exit, }; static int __init fib_rules_init(void) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 1f5caafb4492..15ce30063765 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -10,6 +10,7 @@ #include <net/ipv6.h> #include <net/gre.h> #include <net/pptp.h> +#include <net/tipc.h> #include <linux/igmp.h> #include <linux/icmp.h> #include <linux/sctp.h> @@ -772,23 +773,22 @@ proto_again: break; } case htons(ETH_P_TIPC): { - struct { - __be32 pre[3]; - __be32 srcnode; - } *hdr, _hdr; - hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + struct tipc_basic_hdr *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), + data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { + FLOW_DISSECTOR_KEY_TIPC)) { key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_TIPC_ADDRS, + FLOW_DISSECTOR_KEY_TIPC, target_container); - key_addrs->tipcaddrs.srcnode = hdr->srcnode; - key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC_ADDRS; + key_addrs->tipckey.key = tipc_hdr_rps_key(hdr); + key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC; } fdret = FLOW_DISSECT_RET_OUT_GOOD; break; @@ -1024,8 +1024,8 @@ static inline size_t flow_keys_hash_length(const struct flow_keys *flow) case FLOW_DISSECTOR_KEY_IPV6_ADDRS: diff -= sizeof(flow->addrs.v6addrs); break; - case FLOW_DISSECTOR_KEY_TIPC_ADDRS: - diff -= sizeof(flow->addrs.tipcaddrs); + case FLOW_DISSECTOR_KEY_TIPC: + diff -= sizeof(flow->addrs.tipckey); break; } return (sizeof(*flow) - diff) / sizeof(u32); @@ -1039,8 +1039,8 @@ __be32 flow_get_u32_src(const struct flow_keys *flow) case FLOW_DISSECTOR_KEY_IPV6_ADDRS: return (__force __be32)ipv6_addr_hash( &flow->addrs.v6addrs.src); - case FLOW_DISSECTOR_KEY_TIPC_ADDRS: - return flow->addrs.tipcaddrs.srcnode; + case FLOW_DISSECTOR_KEY_TIPC: + return flow->addrs.tipckey.key; default: return 0; } @@ -1321,8 +1321,8 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { - .key_id = FLOW_DISSECTOR_KEY_TIPC_ADDRS, - .offset = offsetof(struct flow_keys, addrs.tipcaddrs), + .key_id = FLOW_DISSECTOR_KEY_TIPC, + .offset = offsetof(struct flow_keys, addrs.tipckey), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 51d5836d8fb9..799b75268291 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -382,7 +382,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); size_t count = len; - ssize_t ret; + ssize_t ret = 0; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -391,9 +391,20 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, if (len > 0 && buf[len - 1] == '\n') --count; - ret = dev_set_alias(netdev, buf, count); + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + ret = dev_set_alias(netdev, buf, count); + if (ret < 0) + goto err; + ret = len; + netdev_state_change(netdev); + } +err: + rtnl_unlock(); - return ret < 0 ? ret : len; + return ret; } static ssize_t ifalias_show(struct device *dev, diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 912731bed7b7..57557a6a950c 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -334,7 +334,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); npinfo = rcu_dereference_bh(np->dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { diff --git a/net/core/pktgen.c b/net/core/pktgen.c index e3fa53a07d34..f95a15086225 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2711,7 +2711,7 @@ static inline __be16 build_tci(unsigned int id, unsigned int cfi, static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, int datalen) { - struct timeval timestamp; + struct timespec64 timestamp; struct pktgen_hdr *pgh; pgh = skb_put(skb, sizeof(*pgh)); @@ -2773,9 +2773,17 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, pgh->tv_sec = 0; pgh->tv_usec = 0; } else { - do_gettimeofday(×tamp); + /* + * pgh->tv_sec wraps in y2106 when interpreted as unsigned + * as done by wireshark, or y2038 when interpreted as signed. + * This is probably harmless, but if anyone wants to improve + * it, we could introduce a variant that puts 64-bit nanoseconds + * into the respective header bytes. + * This would also be slightly faster to read. + */ + ktime_get_real_ts64(×tamp); pgh->tv_sec = htonl(timestamp.tv_sec); - pgh->tv_usec = htonl(timestamp.tv_usec); + pgh->tv_usec = htonl(timestamp.tv_nsec / NSEC_PER_USEC); } } @@ -3377,7 +3385,7 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) static void pktgen_xmit(struct pktgen_dev *pkt_dev) { - unsigned int burst = ACCESS_ONCE(pkt_dev->burst); + unsigned int burst = READ_ONCE(pkt_dev->burst); struct net_device *odev = pkt_dev->odev; struct netdev_queue *txq; struct sk_buff *skb; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index dc5ad84ac096..dabba2a91fc8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -920,7 +920,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ - + nla_total_size(1); /* IFLA_PROTO_DOWN */ + + nla_total_size(1) /* IFLA_PROTO_DOWN */ + nla_total_size(4) /* IFLA_IF_NETNSID */ + 0; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 97e604d55d55..6b0ff396fa9d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -41,7 +41,6 @@ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/kmemcheck.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/in.h> @@ -234,14 +233,12 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); - kmemcheck_annotate_variable(shinfo->destructor_arg); if (flags & SKB_ALLOC_FCLONE) { struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); - kmemcheck_annotate_bitfield(&fclones->skb2, flags1); skb->fclone = SKB_FCLONE_ORIG; refcount_set(&fclones->fclone_ref, 1); @@ -301,7 +298,6 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); - kmemcheck_annotate_variable(shinfo->destructor_arg); return skb; } @@ -357,7 +353,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) */ void *netdev_alloc_frag(unsigned int fragsz) { - return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); + return __netdev_alloc_frag(fragsz, GFP_ATOMIC); } EXPORT_SYMBOL(netdev_alloc_frag); @@ -370,7 +366,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) void *napi_alloc_frag(unsigned int fragsz) { - return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); + return __napi_alloc_frag(fragsz, GFP_ATOMIC); } EXPORT_SYMBOL(napi_alloc_frag); @@ -1283,7 +1279,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) if (!n) return NULL; - kmemcheck_annotate_bitfield(n, flags1); n->fclone = SKB_FCLONE_UNAVAILABLE; } @@ -4869,6 +4864,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) if (!xnet) return; + ipvs_reset(skb); skb_orphan(skb); skb->mark = 0; } diff --git a/net/core/sock.c b/net/core/sock.c index 759400053110..c0b5b2f17412 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1469,8 +1469,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, sk = kmalloc(prot->obj_size, priority); if (sk != NULL) { - kmemcheck_annotate_bitfield(sk, flags); - if (security_sk_alloc(sk, family, priority)) goto out_free; @@ -2346,16 +2344,18 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) /* guarantee minimum buffer size under pressure */ if (kind == SK_MEM_RECV) { - if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0]) + if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) return 1; } else { /* SK_MEM_SEND */ + int wmem0 = sk_get_wmem0(sk, prot); + if (sk->sk_type == SOCK_STREAM) { - if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) + if (sk->sk_wmem_queued < wmem0) return 1; - } else if (refcount_read(&sk->sk_wmem_alloc) < - prot->sysctl_wmem[0]) + } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { return 1; + } } if (sk_has_memory_pressure(sk)) { @@ -2744,6 +2744,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_max_pacing_rate = ~0U; sk->sk_pacing_rate = ~0U; + sk->sk_pacing_shift = 10; sk->sk_incoming_cpu = -1; /* * Before updating sk_refcnt, we must commit prior changes to memory @@ -3042,7 +3043,6 @@ struct prot_inuse { static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); -#ifdef CONFIG_NET_NS void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) { __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val); @@ -3086,27 +3086,6 @@ static __init int net_inuse_init(void) } core_initcall(net_inuse_init); -#else -static DEFINE_PER_CPU(struct prot_inuse, prot_inuse); - -void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) -{ - __this_cpu_add(prot_inuse.val[prot->inuse_idx], val); -} -EXPORT_SYMBOL_GPL(sock_prot_inuse_add); - -int sock_prot_inuse_get(struct net *net, struct proto *prot) -{ - int cpu, idx = prot->inuse_idx; - int res = 0; - - for_each_possible_cpu(cpu) - res += per_cpu(prot_inuse, cpu).val[idx]; - - return res >= 0 ? res : 0; -} -EXPORT_SYMBOL_GPL(sock_prot_inuse_get); -#endif static void assign_proto_idx(struct proto *prot) { diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index bff5ab88cdbb..b36dceab0dc1 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -131,7 +131,7 @@ static struct dn_rt_hash_bucket *dn_rt_hash_table; static unsigned int dn_rt_hash_mask; static struct timer_list dn_route_timer; -static DEFINE_TIMER(dn_rt_flush_timer, dn_run_flush, 0, 0); +static DEFINE_TIMER(dn_rt_flush_timer, dn_run_flush); int decnet_dst_gc_interval = 2; static struct dst_ops dn_dst_ops = { diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index 08667f68e601..f0710b5d037d 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -156,6 +156,7 @@ static void dn_rehash_zone(struct dn_zone *dz) default: printk(KERN_DEBUG "DECnet: dn_rehash_zone: BUG! %d\n", old_divisor); + /* fall through */ case 256: new_divisor = 1024; new_hashmask = 0x3FF; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index cc5f8f971689..03c3bdf25468 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -7,6 +7,7 @@ config HAVE_NET_DSA config NET_DSA tristate "Distributed Switch Architecture" depends on HAVE_NET_DSA && MAY_USE_DEVLINK + depends on BRIDGE || BRIDGE=n select NET_SWITCHDEV select PHYLIB ---help--- @@ -19,6 +20,9 @@ if NET_DSA config NET_DSA_TAG_BRCM bool +config NET_DSA_TAG_BRCM_PREPEND + bool + config NET_DSA_TAG_DSA bool diff --git a/net/dsa/Makefile b/net/dsa/Makefile index e9a4a0f33e86..0e13c1f95d13 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -5,6 +5,7 @@ dsa_core-y += dsa.o dsa2.o legacy.o master.o port.o slave.o switch.o # tagging formats dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o +dsa_core-$(CONFIG_NET_DSA_TAG_BRCM_PREPEND) += tag_brcm.o dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o dsa_core-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index b8f2d9f7c3ed..6a9d0f50fbee 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -44,6 +44,9 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { #ifdef CONFIG_NET_DSA_TAG_BRCM [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops, #endif +#ifdef CONFIG_NET_DSA_TAG_BRCM_PREPEND + [DSA_TAG_PROTO_BRCM_PREPEND] = &brcm_prepend_netdev_ops, +#endif #ifdef CONFIG_NET_DSA_TAG_DSA [DSA_TAG_PROTO_DSA] = &dsa_netdev_ops, #endif diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 283104e5ca6a..44e3fb7dec8c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -94,14 +94,6 @@ static void dsa_tree_put(struct dsa_switch_tree *dst) kref_put(&dst->refcount, dsa_tree_release); } -/* For platform data configurations, we need to have a valid name argument to - * differentiate a disabled port from an enabled one - */ -static bool dsa_port_is_valid(struct dsa_port *port) -{ - return port->type != DSA_PORT_TYPE_UNUSED; -} - static bool dsa_port_is_dsa(struct dsa_port *port) { return port->type == DSA_PORT_TYPE_DSA; @@ -112,197 +104,214 @@ static bool dsa_port_is_cpu(struct dsa_port *port) return port->type == DSA_PORT_TYPE_CPU; } -static bool dsa_ds_find_port_dn(struct dsa_switch *ds, - struct device_node *port) +static bool dsa_port_is_user(struct dsa_port *dp) { - u32 index; - - for (index = 0; index < ds->num_ports; index++) - if (ds->ports[index].dn == port) - return true; - return false; + return dp->type == DSA_PORT_TYPE_USER; } -static struct dsa_switch *dsa_dst_find_port_dn(struct dsa_switch_tree *dst, - struct device_node *port) +static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst, + struct device_node *dn) { struct dsa_switch *ds; - u32 index; + struct dsa_port *dp; + int device, port; - for (index = 0; index < DSA_MAX_SWITCHES; index++) { - ds = dst->ds[index]; + for (device = 0; device < DSA_MAX_SWITCHES; device++) { + ds = dst->ds[device]; if (!ds) continue; - if (dsa_ds_find_port_dn(ds, port)) - return ds; + for (port = 0; port < ds->num_ports; port++) { + dp = &ds->ports[port]; + + if (dp->dn == dn) + return dp; + } } return NULL; } -static int dsa_port_complete(struct dsa_switch_tree *dst, - struct dsa_switch *src_ds, - struct dsa_port *port, - u32 src_port) +static bool dsa_port_setup_routing_table(struct dsa_port *dp) { - struct device_node *link; - int index; - struct dsa_switch *dst_ds; - - for (index = 0;; index++) { - link = of_parse_phandle(port->dn, "link", index); - if (!link) - break; - - dst_ds = dsa_dst_find_port_dn(dst, link); - of_node_put(link); + struct dsa_switch *ds = dp->ds; + struct dsa_switch_tree *dst = ds->dst; + struct device_node *dn = dp->dn; + struct of_phandle_iterator it; + struct dsa_port *link_dp; + int err; - if (!dst_ds) - return 1; + of_for_each_phandle(&it, err, dn, "link", NULL, 0) { + link_dp = dsa_tree_find_port_by_node(dst, it.node); + if (!link_dp) { + of_node_put(it.node); + return false; + } - src_ds->rtable[dst_ds->index] = src_port; + ds->rtable[link_dp->ds->index] = dp->index; } - return 0; + return true; } -/* A switch is complete if all the DSA ports phandles point to ports - * known in the tree. A return value of 1 means the tree is not - * complete. This is not an error condition. A value of 0 is - * success. - */ -static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds) +static bool dsa_switch_setup_routing_table(struct dsa_switch *ds) { - struct dsa_port *port; - u32 index; - int err; + bool complete = true; + struct dsa_port *dp; + int i; - for (index = 0; index < ds->num_ports; index++) { - port = &ds->ports[index]; - if (!dsa_port_is_valid(port)) - continue; + for (i = 0; i < DSA_MAX_SWITCHES; i++) + ds->rtable[i] = DSA_RTABLE_NONE; - if (!dsa_port_is_dsa(port)) - continue; + for (i = 0; i < ds->num_ports; i++) { + dp = &ds->ports[i]; - err = dsa_port_complete(dst, ds, port, index); - if (err != 0) - return err; + if (dsa_port_is_dsa(dp)) { + complete = dsa_port_setup_routing_table(dp); + if (!complete) + break; + } } - return 0; + return complete; } -/* A tree is complete if all the DSA ports phandles point to ports - * known in the tree. A return value of 1 means the tree is not - * complete. This is not an error condition. A value of 0 is - * success. - */ -static int dsa_dst_complete(struct dsa_switch_tree *dst) +static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst) { struct dsa_switch *ds; - u32 index; - int err; + bool complete = true; + int device; - for (index = 0; index < DSA_MAX_SWITCHES; index++) { - ds = dst->ds[index]; + for (device = 0; device < DSA_MAX_SWITCHES; device++) { + ds = dst->ds[device]; if (!ds) continue; - err = dsa_ds_complete(dst, ds); - if (err != 0) - return err; + complete = dsa_switch_setup_routing_table(ds); + if (!complete) + break; } - return 0; + return complete; } -static int dsa_dsa_port_apply(struct dsa_port *port) +static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst) { - struct dsa_switch *ds = port->ds; - int err; + struct dsa_switch *ds; + struct dsa_port *dp; + int device, port; - err = dsa_port_fixed_link_register_of(port); - if (err) { - dev_warn(ds->dev, "Failed to setup dsa port %d: %d\n", - port->index, err); - return err; - } + for (device = 0; device < DSA_MAX_SWITCHES; device++) { + ds = dst->ds[device]; + if (!ds) + continue; - memset(&port->devlink_port, 0, sizeof(port->devlink_port)); + for (port = 0; port < ds->num_ports; port++) { + dp = &ds->ports[port]; - return devlink_port_register(ds->devlink, &port->devlink_port, - port->index); -} + if (dsa_port_is_cpu(dp)) + return dp; + } + } -static void dsa_dsa_port_unapply(struct dsa_port *port) -{ - devlink_port_unregister(&port->devlink_port); - dsa_port_fixed_link_unregister_of(port); + return NULL; } -static int dsa_cpu_port_apply(struct dsa_port *port) +static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) { - struct dsa_switch *ds = port->ds; - int err; + struct dsa_switch *ds; + struct dsa_port *dp; + int device, port; - err = dsa_port_fixed_link_register_of(port); - if (err) { - dev_warn(ds->dev, "Failed to setup cpu port %d: %d\n", - port->index, err); - return err; + /* DSA currently only supports a single CPU port */ + dst->cpu_dp = dsa_tree_find_first_cpu(dst); + if (!dst->cpu_dp) { + pr_warn("Tree has no master device\n"); + return -EINVAL; } - memset(&port->devlink_port, 0, sizeof(port->devlink_port)); - err = devlink_port_register(ds->devlink, &port->devlink_port, - port->index); - return err; + /* Assign the default CPU port to all ports of the fabric */ + for (device = 0; device < DSA_MAX_SWITCHES; device++) { + ds = dst->ds[device]; + if (!ds) + continue; + + for (port = 0; port < ds->num_ports; port++) { + dp = &ds->ports[port]; + + if (dsa_port_is_user(dp)) + dp->cpu_dp = dst->cpu_dp; + } + } + + return 0; } -static void dsa_cpu_port_unapply(struct dsa_port *port) +static void dsa_tree_teardown_default_cpu(struct dsa_switch_tree *dst) { - devlink_port_unregister(&port->devlink_port); - dsa_port_fixed_link_unregister_of(port); + /* DSA currently only supports a single CPU port */ + dst->cpu_dp = NULL; } -static int dsa_user_port_apply(struct dsa_port *port) +static int dsa_port_setup(struct dsa_port *dp) { - struct dsa_switch *ds = port->ds; + struct dsa_switch *ds = dp->ds; int err; - err = dsa_slave_create(port); - if (err) { - dev_warn(ds->dev, "Failed to create slave %d: %d\n", - port->index, err); - port->slave = NULL; - return err; - } + memset(&dp->devlink_port, 0, sizeof(dp->devlink_port)); - memset(&port->devlink_port, 0, sizeof(port->devlink_port)); - err = devlink_port_register(ds->devlink, &port->devlink_port, - port->index); + err = devlink_port_register(ds->devlink, &dp->devlink_port, dp->index); if (err) return err; - devlink_port_type_eth_set(&port->devlink_port, port->slave); + switch (dp->type) { + case DSA_PORT_TYPE_UNUSED: + break; + case DSA_PORT_TYPE_CPU: + case DSA_PORT_TYPE_DSA: + err = dsa_port_fixed_link_register_of(dp); + if (err) { + dev_err(ds->dev, "failed to register fixed link for port %d.%d\n", + ds->index, dp->index); + return err; + } + + break; + case DSA_PORT_TYPE_USER: + err = dsa_slave_create(dp); + if (err) + dev_err(ds->dev, "failed to create slave for port %d.%d\n", + ds->index, dp->index); + else + devlink_port_type_eth_set(&dp->devlink_port, dp->slave); + break; + } return 0; } -static void dsa_user_port_unapply(struct dsa_port *port) +static void dsa_port_teardown(struct dsa_port *dp) { - devlink_port_unregister(&port->devlink_port); - if (port->slave) { - dsa_slave_destroy(port->slave); - port->slave = NULL; + devlink_port_unregister(&dp->devlink_port); + + switch (dp->type) { + case DSA_PORT_TYPE_UNUSED: + break; + case DSA_PORT_TYPE_CPU: + case DSA_PORT_TYPE_DSA: + dsa_port_fixed_link_unregister_of(dp); + break; + case DSA_PORT_TYPE_USER: + if (dp->slave) { + dsa_slave_destroy(dp->slave); + dp->slave = NULL; + } + break; } } -static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) +static int dsa_switch_setup(struct dsa_switch *ds) { - struct dsa_port *port; - u32 index; int err; /* Initialize ds->phys_mii_mask before registering the slave MDIO bus @@ -343,136 +352,145 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) return err; } - for (index = 0; index < ds->num_ports; index++) { - port = &ds->ports[index]; - if (!dsa_port_is_valid(port)) - continue; + return 0; +} - if (dsa_port_is_dsa(port)) { - err = dsa_dsa_port_apply(port); - if (err) - return err; +static void dsa_switch_teardown(struct dsa_switch *ds) +{ + if (ds->slave_mii_bus && ds->ops->phy_read) + mdiobus_unregister(ds->slave_mii_bus); + + dsa_switch_unregister_notifier(ds); + + if (ds->devlink) { + devlink_unregister(ds->devlink); + devlink_free(ds->devlink); + ds->devlink = NULL; + } + +} + +static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) +{ + struct dsa_switch *ds; + struct dsa_port *dp; + int device, port; + int err; + + for (device = 0; device < DSA_MAX_SWITCHES; device++) { + ds = dst->ds[device]; + if (!ds) continue; - } - if (dsa_port_is_cpu(port)) { - err = dsa_cpu_port_apply(port); + err = dsa_switch_setup(ds); + if (err) + return err; + + for (port = 0; port < ds->num_ports; port++) { + dp = &ds->ports[port]; + + err = dsa_port_setup(dp); if (err) return err; - continue; } - - err = dsa_user_port_apply(port); - if (err) - continue; } return 0; } -static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds) +static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst) { - struct dsa_port *port; - u32 index; + struct dsa_switch *ds; + struct dsa_port *dp; + int device, port; - for (index = 0; index < ds->num_ports; index++) { - port = &ds->ports[index]; - if (!dsa_port_is_valid(port)) + for (device = 0; device < DSA_MAX_SWITCHES; device++) { + ds = dst->ds[device]; + if (!ds) continue; - if (dsa_port_is_dsa(port)) { - dsa_dsa_port_unapply(port); - continue; - } + for (port = 0; port < ds->num_ports; port++) { + dp = &ds->ports[port]; - if (dsa_port_is_cpu(port)) { - dsa_cpu_port_unapply(port); - continue; + dsa_port_teardown(dp); } - dsa_user_port_unapply(port); + dsa_switch_teardown(ds); } +} - if (ds->slave_mii_bus && ds->ops->phy_read) - mdiobus_unregister(ds->slave_mii_bus); +static int dsa_tree_setup_master(struct dsa_switch_tree *dst) +{ + struct dsa_port *cpu_dp = dst->cpu_dp; + struct net_device *master = cpu_dp->master; - dsa_switch_unregister_notifier(ds); + /* DSA currently supports a single pair of CPU port and master device */ + return dsa_master_setup(master, cpu_dp); +} - if (ds->devlink) { - devlink_unregister(ds->devlink); - devlink_free(ds->devlink); - ds->devlink = NULL; - } +static void dsa_tree_teardown_master(struct dsa_switch_tree *dst) +{ + struct dsa_port *cpu_dp = dst->cpu_dp; + struct net_device *master = cpu_dp->master; + return dsa_master_teardown(master); } -static int dsa_dst_apply(struct dsa_switch_tree *dst) +static int dsa_tree_setup(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; - u32 index; + bool complete; int err; - for (index = 0; index < DSA_MAX_SWITCHES; index++) { - ds = dst->ds[index]; - if (!ds) - continue; - - err = dsa_ds_apply(dst, ds); - if (err) - return err; + if (dst->setup) { + pr_err("DSA: tree %d already setup! Disjoint trees?\n", + dst->index); + return -EEXIST; } - /* If we use a tagging format that doesn't have an ethertype - * field, make sure that all packets from this point on get - * sent to the tag format's receive function. - */ - wmb(); - dst->cpu_dp->master->dsa_ptr = dst->cpu_dp; + complete = dsa_tree_setup_routing_table(dst); + if (!complete) + return 0; + + err = dsa_tree_setup_default_cpu(dst); + if (err) + return err; - err = dsa_master_ethtool_setup(dst->cpu_dp->master); + err = dsa_tree_setup_switches(dst); if (err) return err; - dst->applied = true; + err = dsa_tree_setup_master(dst); + if (err) + return err; + + dst->setup = true; + + pr_info("DSA: tree %d setup\n", dst->index); return 0; } -static void dsa_dst_unapply(struct dsa_switch_tree *dst) +static void dsa_tree_teardown(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; - u32 index; - - if (!dst->applied) + if (!dst->setup) return; - dsa_master_ethtool_restore(dst->cpu_dp->master); + dsa_tree_teardown_master(dst); - dst->cpu_dp->master->dsa_ptr = NULL; + dsa_tree_teardown_switches(dst); - /* If we used a tagging format that doesn't have an ethertype - * field, make sure that all packets from this point get sent - * without the tag and go through the regular receive path. - */ - wmb(); + dsa_tree_teardown_default_cpu(dst); - for (index = 0; index < DSA_MAX_SWITCHES; index++) { - ds = dst->ds[index]; - if (!ds) - continue; + pr_info("DSA: tree %d torn down\n", dst->index); - dsa_ds_unapply(dst, ds); - } - - dst->cpu_dp = NULL; - - pr_info("DSA: tree %d unapplied\n", dst->index); - dst->applied = false; + dst->setup = false; } static void dsa_tree_remove_switch(struct dsa_switch_tree *dst, unsigned int index) { + dsa_tree_teardown(dst); + dst->ds[index] = NULL; dsa_tree_put(dst); } @@ -481,6 +499,7 @@ static int dsa_tree_add_switch(struct dsa_switch_tree *dst, struct dsa_switch *ds) { unsigned int index = ds->index; + int err; if (dst->ds[index]) return -EBUSY; @@ -488,7 +507,11 @@ static int dsa_tree_add_switch(struct dsa_switch_tree *dst, dsa_tree_get(dst); dst->ds[index] = ds; - return 0; + err = dsa_tree_setup(dst); + if (err) + dsa_tree_remove_switch(dst, index); + + return err; } static int dsa_port_parse_user(struct dsa_port *dp, const char *name) @@ -516,7 +539,7 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; - tag_protocol = ds->ops->get_tag_protocol(ds); + tag_protocol = ds->ops->get_tag_protocol(ds, dp->index); tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(tag_ops)) { dev_warn(ds->dev, "No tagger for this switch\n"); @@ -532,86 +555,6 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) return 0; } -static int dsa_cpu_parse(struct dsa_port *port, u32 index, - struct dsa_switch_tree *dst, - struct dsa_switch *ds) -{ - if (!dst->cpu_dp) - dst->cpu_dp = port; - - return 0; -} - -static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds) -{ - struct dsa_port *port; - u32 index; - int err; - - for (index = 0; index < ds->num_ports; index++) { - port = &ds->ports[index]; - if (!dsa_port_is_valid(port) || - dsa_port_is_dsa(port)) - continue; - - if (dsa_port_is_cpu(port)) { - err = dsa_cpu_parse(port, index, dst, ds); - if (err) - return err; - } - - } - - pr_info("DSA: switch %d %d parsed\n", dst->index, ds->index); - - return 0; -} - -static int dsa_dst_parse(struct dsa_switch_tree *dst) -{ - struct dsa_switch *ds; - struct dsa_port *dp; - u32 index; - int port; - int err; - - for (index = 0; index < DSA_MAX_SWITCHES; index++) { - ds = dst->ds[index]; - if (!ds) - continue; - - err = dsa_ds_parse(dst, ds); - if (err) - return err; - } - - if (!dst->cpu_dp) { - pr_warn("Tree has no master device\n"); - return -EINVAL; - } - - /* Assign the default CPU port to all ports of the fabric */ - for (index = 0; index < DSA_MAX_SWITCHES; index++) { - ds = dst->ds[index]; - if (!ds) - continue; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; - if (!dsa_port_is_valid(dp) || - dsa_port_is_dsa(dp) || - dsa_port_is_cpu(dp)) - continue; - - dp->cpu_dp = dst->cpu_dp; - } - } - - pr_info("DSA: tree %d parsed\n", dst->index); - - return 0; -} - static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn) { struct device_node *ethernet = of_parse_phandle(dn, "ethernet", 0); @@ -768,13 +711,18 @@ static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) return dsa_switch_parse_ports(ds, cd); } -static int _dsa_register_switch(struct dsa_switch *ds) +static int dsa_switch_add(struct dsa_switch *ds) +{ + struct dsa_switch_tree *dst = ds->dst; + + return dsa_tree_add_switch(dst, ds); +} + +static int dsa_switch_probe(struct dsa_switch *ds) { struct dsa_chip_data *pdata = ds->dev->platform_data; struct device_node *np = ds->dev->of_node; - struct dsa_switch_tree *dst; - unsigned int index; - int i, err; + int err; if (np) err = dsa_switch_parse_of(ds, np); @@ -786,46 +734,7 @@ static int _dsa_register_switch(struct dsa_switch *ds) if (err) return err; - index = ds->index; - dst = ds->dst; - - /* Initialize the routing table */ - for (i = 0; i < DSA_MAX_SWITCHES; ++i) - ds->rtable[i] = DSA_RTABLE_NONE; - - err = dsa_tree_add_switch(dst, ds); - if (err) - return err; - - err = dsa_dst_complete(dst); - if (err < 0) - goto out_del_dst; - - /* Not all switches registered yet */ - if (err == 1) - return 0; - - if (dst->applied) { - pr_info("DSA: Disjoint trees?\n"); - return -EINVAL; - } - - err = dsa_dst_parse(dst); - if (err) - goto out_del_dst; - - err = dsa_dst_apply(dst); - if (err) { - dsa_dst_unapply(dst); - goto out_del_dst; - } - - return 0; - -out_del_dst: - dsa_tree_remove_switch(dst, index); - - return err; + return dsa_switch_add(ds); } struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n) @@ -855,27 +764,25 @@ int dsa_register_switch(struct dsa_switch *ds) int err; mutex_lock(&dsa2_mutex); - err = _dsa_register_switch(ds); + err = dsa_switch_probe(ds); mutex_unlock(&dsa2_mutex); return err; } EXPORT_SYMBOL_GPL(dsa_register_switch); -static void _dsa_unregister_switch(struct dsa_switch *ds) +static void dsa_switch_remove(struct dsa_switch *ds) { struct dsa_switch_tree *dst = ds->dst; unsigned int index = ds->index; - dsa_dst_unapply(dst); - dsa_tree_remove_switch(dst, index); } void dsa_unregister_switch(struct dsa_switch *ds) { mutex_lock(&dsa2_mutex); - _dsa_unregister_switch(ds); + dsa_switch_remove(ds); mutex_unlock(&dsa2_mutex); } EXPORT_SYMBOL_GPL(dsa_unregister_switch); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 253a613c40cd..7d036696e8c4 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -108,8 +108,8 @@ int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], const unsigned char *addr, u16 vid); /* master.c */ -int dsa_master_ethtool_setup(struct net_device *dev); -void dsa_master_ethtool_restore(struct net_device *dev); +int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp); +void dsa_master_teardown(struct net_device *dev); static inline struct net_device *dsa_master_find_slave(struct net_device *dev, int device, int port) @@ -147,10 +147,10 @@ int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, u16 vid); int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data); -int dsa_port_mdb_add(struct dsa_port *dp, +int dsa_port_mdb_add(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb, struct switchdev_trans *trans); -int dsa_port_mdb_del(struct dsa_port *dp, +int dsa_port_mdb_del(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); int dsa_port_vlan_add(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan, @@ -191,6 +191,7 @@ void dsa_switch_unregister_notifier(struct dsa_switch *ds); /* tag_brcm.c */ extern const struct dsa_device_ops brcm_netdev_ops; +extern const struct dsa_device_ops brcm_prepend_netdev_ops; /* tag_dsa.c */ extern const struct dsa_device_ops dsa_netdev_ops; diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 0511fe2feff7..84611d7fcfa2 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -151,7 +151,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; - tag_protocol = ops->get_tag_protocol(ds); + tag_protocol = ops->get_tag_protocol(ds, dst->cpu_dp->index); tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(tag_ops)) return PTR_ERR(tag_ops); @@ -593,15 +593,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, if (!configured) return -EPROBE_DEFER; - /* - * If we use a tagging format that doesn't have an ethertype - * field, make sure that all packets from this point on get - * sent to the tag format's receive function. - */ - wmb(); - dev->dsa_ptr = dst->cpu_dp; - - return dsa_master_ethtool_setup(dst->cpu_dp->master); + return dsa_master_setup(dst->cpu_dp->master, dst->cpu_dp); } static int dsa_probe(struct platform_device *pdev) @@ -666,15 +658,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) { int i; - dsa_master_ethtool_restore(dst->cpu_dp->master); - - dst->cpu_dp->master->dsa_ptr = NULL; - - /* If we used a tagging format that doesn't have an ethertype - * field, make sure that all packets from this point get sent - * without the tag and go through the regular receive path. - */ - wmb(); + dsa_master_teardown(dst->cpu_dp->master); for (i = 0; i < dst->pd->nr_chips; i++) { struct dsa_switch *ds = dst->ds[i]; diff --git a/net/dsa/master.c b/net/dsa/master.c index 5f3f57e372e0..00589147f042 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -85,7 +85,7 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, } } -int dsa_master_ethtool_setup(struct net_device *dev) +static int dsa_master_ethtool_setup(struct net_device *dev) { struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch *ds = cpu_dp->ds; @@ -108,10 +108,36 @@ int dsa_master_ethtool_setup(struct net_device *dev) return 0; } -void dsa_master_ethtool_restore(struct net_device *dev) +static void dsa_master_ethtool_teardown(struct net_device *dev) { struct dsa_port *cpu_dp = dev->dsa_ptr; dev->ethtool_ops = cpu_dp->orig_ethtool_ops; cpu_dp->orig_ethtool_ops = NULL; } + +int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) +{ + /* If we use a tagging format that doesn't have an ethertype + * field, make sure that all packets from this point on get + * sent to the tag format's receive function. + */ + wmb(); + + dev->dsa_ptr = cpu_dp; + + return dsa_master_ethtool_setup(dev); +} + +void dsa_master_teardown(struct net_device *dev) +{ + dsa_master_ethtool_teardown(dev); + + dev->dsa_ptr = NULL; + + /* If we used a tagging format that doesn't have an ethertype + * field, make sure that all packets from this point get sent + * without the tag and go through the regular receive path. + */ + wmb(); +} diff --git a/net/dsa/port.c b/net/dsa/port.c index bb30b1a7de3a..bb4be2679904 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -17,7 +17,7 @@ #include "dsa_priv.h" -static int dsa_port_notify(struct dsa_port *dp, unsigned long e, void *v) +static int dsa_port_notify(const struct dsa_port *dp, unsigned long e, void *v) { struct raw_notifier_head *nh = &dp->ds->dst->nh; int err; @@ -215,7 +215,7 @@ int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data) return ds->ops->port_fdb_dump(ds, port, cb, data); } -int dsa_port_mdb_add(struct dsa_port *dp, +int dsa_port_mdb_add(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb, struct switchdev_trans *trans) { @@ -229,7 +229,7 @@ int dsa_port_mdb_add(struct dsa_port *dp, return dsa_port_notify(dp, DSA_NOTIFIER_MDB_ADD, &info); } -int dsa_port_mdb_del(struct dsa_port *dp, +int dsa_port_mdb_del(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb) { struct dsa_notifier_mdb_info info = { @@ -252,7 +252,10 @@ int dsa_port_vlan_add(struct dsa_port *dp, .vlan = vlan, }; - return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); + if (br_vlan_enabled(dp->bridge_dev)) + return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); + + return 0; } int dsa_port_vlan_del(struct dsa_port *dp, @@ -264,7 +267,10 @@ int dsa_port_vlan_del(struct dsa_port *dp, .vlan = vlan, }; - return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); + if (br_vlan_enabled(dp->bridge_dev)) + return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); + + return 0; } int dsa_port_fixed_link_register_of(struct dsa_port *dp) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 814ced75a0cc..d6e7a642493b 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -304,6 +304,13 @@ static int dsa_slave_port_obj_add(struct net_device *dev, case SWITCHDEV_OBJ_ID_PORT_MDB: err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans); break; + case SWITCHDEV_OBJ_ID_HOST_MDB: + /* DSA can directly translate this to a normal MDB add, + * but on the CPU port. + */ + err = dsa_port_mdb_add(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj), + trans); + break; case SWITCHDEV_OBJ_ID_PORT_VLAN: err = dsa_port_vlan_add(dp, SWITCHDEV_OBJ_PORT_VLAN(obj), trans); @@ -326,6 +333,12 @@ static int dsa_slave_port_obj_del(struct net_device *dev, case SWITCHDEV_OBJ_ID_PORT_MDB: err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; + case SWITCHDEV_OBJ_ID_HOST_MDB: + /* DSA can directly translate this to a normal MDB add, + * but on the CPU port. + */ + err = dsa_port_mdb_del(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj)); + break; case SWITCHDEV_OBJ_ID_PORT_VLAN: err = dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj)); break; @@ -342,11 +355,12 @@ static int dsa_slave_port_attr_get(struct net_device *dev, { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; + struct dsa_switch_tree *dst = ds->dst; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_PARENT_ID: - attr->u.ppid.id_len = sizeof(ds->index); - memcpy(&attr->u.ppid.id, &ds->index, attr->u.ppid.id_len); + attr->u.ppid.id_len = sizeof(dst->index); + memcpy(&attr->u.ppid.id, &dst->index, attr->u.ppid.id_len); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT: attr->u.brport_flags_support = 0; @@ -1147,7 +1161,7 @@ static void dsa_slave_notify(struct net_device *dev, unsigned long val) int dsa_slave_create(struct dsa_port *port) { - struct dsa_port *cpu_dp = port->cpu_dp; + const struct dsa_port *cpu_dp = port->cpu_dp; struct net_device *master = cpu_dp->master; struct dsa_switch *ds = port->ds; const char *name = port->name; diff --git a/net/dsa/switch.c b/net/dsa/switch.c index e6c06aa349a6..29608d087a7c 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -121,7 +121,7 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, if (ds->index == info->sw_index) set_bit(info->port, group); for (port = 0; port < ds->num_ports; port++) - if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) + if (dsa_is_dsa_port(ds, port)) set_bit(port, group); if (switchdev_trans_ph_prepare(trans)) { @@ -133,6 +133,8 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, if (err) return err; } + + return 0; } for_each_set_bit(port, group, ds->num_ports) @@ -180,6 +182,8 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds, if (err) return err; } + + return 0; } for_each_set_bit(port, members, ds->num_ports) diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 9e082bae3cb0..e6e0b7b6025c 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -59,7 +59,9 @@ #define BRCM_EG_TC_MASK 0x7 #define BRCM_EG_PID_MASK 0x1f -static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb, + struct net_device *dev, + unsigned int offset) { struct dsa_port *dp = dsa_slave_to_port(dev); u16 queue = skb_get_queue_mapping(skb); @@ -70,10 +72,10 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev skb_push(skb, BRCM_TAG_LEN); - memmove(skb->data, skb->data + BRCM_TAG_LEN, 2 * ETH_ALEN); + if (offset) + memmove(skb->data, skb->data + BRCM_TAG_LEN, offset); - /* Build the tag after the MAC Source Address */ - brcm_tag = skb->data + 2 * ETH_ALEN; + brcm_tag = skb->data + offset; /* Set the ingress opcode, traffic class, tag enforcment is * deprecated @@ -94,8 +96,10 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev return skb; } -static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt, + unsigned int offset) { int source_port; u8 *brcm_tag; @@ -103,8 +107,7 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely(!pskb_may_pull(skb, BRCM_TAG_LEN))) return NULL; - /* skb->data points to the EtherType, the tag is right before it */ - brcm_tag = skb->data - 2; + brcm_tag = skb->data - offset; /* The opcode should never be different than 0b000 */ if (unlikely((brcm_tag[0] >> BRCM_OPCODE_SHIFT) & BRCM_OPCODE_MASK)) @@ -126,15 +129,60 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, BRCM_TAG_LEN); + return skb; +} + +#ifdef CONFIG_NET_DSA_TAG_BRCM +static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + /* Build the tag after the MAC Source Address */ + return brcm_tag_xmit_ll(skb, dev, 2 * ETH_ALEN); +} + + +static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt) +{ + struct sk_buff *nskb; + + /* skb->data points to the EtherType, the tag is right before it */ + nskb = brcm_tag_rcv_ll(skb, dev, pt, 2); + if (!nskb) + return nskb; + /* Move the Ethernet DA and SA */ - memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - BRCM_TAG_LEN, + memmove(nskb->data - ETH_HLEN, + nskb->data - ETH_HLEN - BRCM_TAG_LEN, 2 * ETH_ALEN); - return skb; + return nskb; } const struct dsa_device_ops brcm_netdev_ops = { .xmit = brcm_tag_xmit, .rcv = brcm_tag_rcv, }; +#endif + +#ifdef CONFIG_NET_DSA_TAG_BRCM_PREPEND +static struct sk_buff *brcm_tag_xmit_prepend(struct sk_buff *skb, + struct net_device *dev) +{ + /* tag is prepended to the packet */ + return brcm_tag_xmit_ll(skb, dev, 0); +} + +static struct sk_buff *brcm_tag_rcv_prepend(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt) +{ + /* tag is prepended to the packet */ + return brcm_tag_rcv_ll(skb, dev, pt, ETH_HLEN); +} + +const struct dsa_device_ops brcm_prepend_netdev_ops = { + .xmit = brcm_tag_xmit_prepend, + .rcv = brcm_tag_rcv_prepend, +}; +#endif diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index dbbcdafed8c3..cd13cfc542ce 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -141,6 +141,8 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } + skb->offload_fwd_mark = 1; + return skb; } diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index f38a626b3a05..4083326b806e 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -160,6 +160,8 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } + skb->offload_fwd_mark = 1; + return skb; } diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index e526c8967b98..548c00254c07 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -42,6 +42,10 @@ #define LAN9303_TAG_LEN 4 # define LAN9303_TAG_TX_USE_ALR BIT(3) # define LAN9303_TAG_TX_STP_OVERRIDE BIT(4) +# define LAN9303_TAG_RX_IGMP BIT(3) +# define LAN9303_TAG_RX_STP BIT(4) +# define LAN9303_TAG_RX_TRAPPED_TO_CPU (LAN9303_TAG_RX_IGMP | \ + LAN9303_TAG_RX_STP) /* Decide whether to transmit using ALR lookup, or transmit directly to * port using tag. ALR learning is performed only when using ALR lookup. @@ -88,9 +92,10 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) } static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt) { u16 *lan9303_tag; + u16 lan9303_tag1; unsigned int source_port; if (unlikely(!pskb_may_pull(skb, LAN9303_TAG_LEN))) { @@ -112,7 +117,8 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, return NULL; } - source_port = ntohs(lan9303_tag[1]) & 0x3; + lan9303_tag1 = ntohs(lan9303_tag[1]); + source_port = lan9303_tag1 & 0x3; skb->dev = dsa_master_find_slave(dev, 0, source_port); if (!skb->dev) { @@ -126,8 +132,7 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, skb_pull_rcsum(skb, 2 + 2); memmove(skb->data - ETH_HLEN, skb->data - (ETH_HLEN + LAN9303_TAG_LEN), 2 * ETH_ALEN); - skb->offload_fwd_mark = !ether_addr_equal(skb->data - ETH_HLEN, - eth_stp_addr); + skb->offload_fwd_mark = !(lan9303_tag1 & LAN9303_TAG_RX_TRAPPED_TO_CPU); return skb; } diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 37db44f60718..4dd95cdd8070 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -240,7 +240,7 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) if (err == -EINPROGRESS) goto out; - if (err == -EBUSY) + if (err == -ENOSPC) err = NET_XMIT_DROP; goto out_free; } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index b00e4a43b4dc..d57aa64fa7c7 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -432,7 +432,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * case -EINPROGRESS: goto error; - case -EBUSY: + case -ENOSPC: err = NET_XMIT_DROP; break; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 589caaa90613..f04d944f8abe 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -710,7 +710,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) bool ecn_ca = false; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); } else { val = nla_get_u32(nla); } @@ -1030,7 +1030,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) return -EINVAL; } else { diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 7f3ef5c287a1..26a3d0315728 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -164,7 +164,7 @@ static void inet_frag_worker(struct work_struct *work) local_bh_disable(); - for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) { + for (i = READ_ONCE(f->next_bucket); budget; --budget) { evicted += inet_evict_bucket(f, &f->hash[i]); i = (i + 1) & (INETFRAGS_HASHSZ - 1); if (evicted > INETFRAGS_EVICT_MAX) diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index a4bab81f1462..c690cd0d9b3f 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -9,7 +9,6 @@ */ #include <linux/kernel.h> -#include <linux/kmemcheck.h> #include <linux/slab.h> #include <linux/module.h> #include <net/inet_hashtables.h> @@ -167,8 +166,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, if (tw) { const struct inet_sock *inet = inet_sk(sk); - kmemcheck_annotate_bitfield(tw, flags); - tw->tw_dr = dr; /* Give us an identity. */ tw->tw_daddr = inet->inet_daddr; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index c105a315b1a3..bb6239169b1a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -773,20 +773,46 @@ free_skb: return NETDEV_TX_OK; } +static void ipgre_link_update(struct net_device *dev, bool set_mtu) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + int len; + + len = tunnel->tun_hlen; + tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); + len = tunnel->tun_hlen - len; + tunnel->hlen = tunnel->hlen + len; + + dev->needed_headroom = dev->needed_headroom + len; + if (set_mtu) + dev->mtu = max_t(int, dev->mtu - len, 68); + + if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { + if (!(tunnel->parms.o_flags & TUNNEL_CSUM) || + tunnel->encap.type == TUNNEL_ENCAP_NONE) { + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } + dev->features |= NETIF_F_LLTX; + } +} + static int ipgre_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { - int err; struct ip_tunnel_parm p; + int err; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) return -EFAULT; + if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || - p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) || - ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) + p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) || + ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; } + p.i_flags = gre_flags_to_tnl_flags(p.i_flags); p.o_flags = gre_flags_to_tnl_flags(p.o_flags); @@ -794,11 +820,22 @@ static int ipgre_tunnel_ioctl(struct net_device *dev, if (err) return err; + if (cmd == SIOCCHGTUNNEL) { + struct ip_tunnel *t = netdev_priv(dev); + + t->parms.i_flags = p.i_flags; + t->parms.o_flags = p.o_flags; + + if (strcmp(dev->rtnl_link_ops->kind, "erspan")) + ipgre_link_update(dev, true); + } + p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags); p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) return -EFAULT; + return 0; } @@ -1307,9 +1344,9 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; __u32 fwmark = t->fwmark; + struct ip_tunnel_parm p; int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { @@ -1322,7 +1359,18 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; - return ip_tunnel_changelink(dev, tb, &p, fwmark); + + err = ip_tunnel_changelink(dev, tb, &p, fwmark); + if (err < 0) + return err; + + t->parms.i_flags = p.i_flags; + t->parms.o_flags = p.o_flags; + + if (strcmp(dev->rtnl_link_ops->kind, "erspan")) + ipgre_link_update(dev, !tb[IFLA_MTU]); + + return 0; } static size_t ipgre_get_size(const struct net_device *dev) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 9e2770fd00be..f88221aebc9d 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -634,6 +634,25 @@ static void get_counters(const struct xt_table_info *t, } } +static void get_old_counters(const struct xt_table_info *t, + struct xt_counters counters[]) +{ + struct arpt_entry *iter; + unsigned int cpu, i; + + for_each_possible_cpu(cpu) { + i = 0; + xt_entry_foreach(iter, t->entries, t->size) { + struct xt_counters *tmp; + + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); + ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); + ++i; + } + cond_resched(); + } +} + static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; @@ -910,8 +929,7 @@ static int __do_replace(struct net *net, const char *name, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters, and synchronize with replace */ - get_counters(oldinfo, counters); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 39286e543ee6..4cbe5e80f3bf 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -781,6 +781,26 @@ get_counters(const struct xt_table_info *t, } } +static void get_old_counters(const struct xt_table_info *t, + struct xt_counters counters[]) +{ + struct ipt_entry *iter; + unsigned int cpu, i; + + for_each_possible_cpu(cpu) { + i = 0; + xt_entry_foreach(iter, t->entries, t->size) { + const struct xt_counters *tmp; + + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); + ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); + ++i; /* macro does multi eval of i */ + } + + cond_resched(); + } +} + static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; @@ -1070,8 +1090,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters, and synchronize with replace */ - get_counters(oldinfo, counters); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index fe374da4bc13..89af9d88ca21 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -344,7 +344,7 @@ static void ipv4_hooks_unregister(struct net *net) mutex_unlock(®ister_ipv4_hooks); } -struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { +const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { .l3proto = PF_INET, .pkt_to_tuple = ipv4_pkt_to_tuple, .invert_tuple = ipv4_invert_tuple, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index a046c298413a..1849fedd9b81 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -81,7 +81,6 @@ static int icmp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeout) { /* Do not immediately delete the connection after the first @@ -165,6 +164,12 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, return NF_ACCEPT; } +static void icmp_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg); +} + /* Small and modified version of icmp_rcv */ static int icmp_error(struct net *net, struct nf_conn *tmpl, @@ -177,18 +182,14 @@ icmp_error(struct net *net, struct nf_conn *tmpl, /* Not enough header? */ icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); if (icmph == NULL) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, - NULL, "nf_ct_icmp: short packet "); + icmp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* See ip_conntrack_proto_tcp.c */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL, - "nf_ct_icmp: bad HW ICMP checksum "); + icmp_error_log(skb, net, pf, "bad hw icmp checksum"); return -NF_ACCEPT; } @@ -199,9 +200,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, * discarded. */ if (icmph->type > NR_ICMP_TYPES) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL, - "nf_ct_icmp: invalid ICMP type "); + icmp_error_log(skb, net, pf, "invalid icmp type"); return -NF_ACCEPT; } @@ -259,9 +258,14 @@ static int icmp_nlattr_to_tuple(struct nlattr *tb[], return 0; } -static int icmp_nlattr_tuple_size(void) +static unsigned int icmp_nlattr_tuple_size(void) { - return nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); + + return size; } #endif diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 127153f1ed8a..9f37c4727861 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -212,7 +212,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), - SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER), SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER), SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER), SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index bc40bd411196..43b69af242e1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -495,7 +495,7 @@ u32 ip_idents_reserve(u32 hash, int segs) { u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; - u32 old = ACCESS_ONCE(*p_tstamp); + u32 old = READ_ONCE(*p_tstamp); u32 now = (u32)jiffies; u32 new, delta = 0; @@ -651,9 +651,12 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe; struct rtable *rt; + u32 genid, hval; unsigned int i; int depth; - u32 hval = fnhe_hashfun(daddr); + + genid = fnhe_genid(dev_net(nh->nh_dev)); + hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); @@ -676,12 +679,13 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, } if (fnhe) { + if (fnhe->fnhe_genid != genid) + fnhe->fnhe_genid = genid; if (gw) fnhe->fnhe_gw = gw; - if (pmtu) { + if (pmtu) fnhe->fnhe_pmtu = pmtu; - fnhe->fnhe_expires = max(1UL, expires); - } + fnhe->fnhe_expires = max(1UL, expires); /* Update all cached dsts too */ rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) @@ -700,7 +704,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, fnhe->fnhe_next = hash->chain; rcu_assign_pointer(hash->chain, fnhe); } - fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev)); + fnhe->fnhe_genid = genid; fnhe->fnhe_daddr = daddr; fnhe->fnhe_gw = gw; fnhe->fnhe_pmtu = pmtu; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a82b44038308..93e172118a94 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -201,6 +201,8 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(ctl->data, struct net, + ipv4.tcp_congestion_control); char val[TCP_CA_NAME_MAX]; struct ctl_table tbl = { .data = val, @@ -208,11 +210,11 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, }; int ret; - tcp_get_default_congestion_control(val); + tcp_get_default_congestion_control(net, val); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) - ret = tcp_set_default_congestion_control(val); + ret = tcp_set_default_congestion_control(net, val); return ret; } @@ -441,34 +443,12 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_doulongvec_minmax, }, { - .procname = "tcp_wmem", - .data = &sysctl_tcp_wmem, - .maxlen = sizeof(sysctl_tcp_wmem), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, - { - .procname = "tcp_rmem", - .data = &sysctl_tcp_rmem, - .maxlen = sizeof(sysctl_tcp_rmem), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, - { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_congestion_control", - .mode = 0644, - .maxlen = TCP_CA_NAME_MAX, - .proc_handler = proc_tcp_congestion_control, - }, #ifdef CONFIG_NETLABEL { .procname = "cipso_cache_enable", @@ -780,6 +760,13 @@ static struct ctl_table ipv4_net_table[] = { }, #endif { + .procname = "tcp_congestion_control", + .data = &init_net.ipv4.tcp_congestion_control, + .mode = 0644, + .maxlen = TCP_CA_NAME_MAX, + .proc_handler = proc_tcp_congestion_control, + }, + { .procname = "tcp_keepalive_time", .data = &init_net.ipv4.sysctl_tcp_keepalive_time, .maxlen = sizeof(int), @@ -1164,6 +1151,22 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &zero, .extra2 = &thousand, }, + { + .procname = "tcp_wmem", + .data = &init_net.ipv4.sysctl_tcp_wmem, + .maxlen = sizeof(init_net.ipv4.sysctl_tcp_wmem), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, + { + .procname = "tcp_rmem", + .data = &init_net.ipv4.sysctl_tcp_rmem, + .maxlen = sizeof(init_net.ipv4.sysctl_tcp_rmem), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c4cb19ed4628..bf97317e6c97 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -289,12 +289,7 @@ struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); long sysctl_tcp_mem[3] __read_mostly; -int sysctl_tcp_wmem[3] __read_mostly; -int sysctl_tcp_rmem[3] __read_mostly; - EXPORT_SYMBOL(sysctl_tcp_mem); -EXPORT_SYMBOL(sysctl_tcp_rmem); -EXPORT_SYMBOL(sysctl_tcp_wmem); atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); @@ -456,8 +451,8 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - sk->sk_sndbuf = sysctl_tcp_wmem[1]; - sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; sk_sockets_allocated_inc(sk); } @@ -2514,8 +2509,6 @@ static int tcp_repair_options_est(struct sock *sk, return -EINVAL; tp->rx_opt.sack_ok |= TCP_SACK_SEEN; - if (sock_net(sk)->ipv4.sysctl_tcp_fack) - tcp_enable_fack(tp); break; case TCPOPT_TIMESTAMP: if (opt.opt_val != 0) @@ -2984,7 +2977,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; - info->tcpi_fackets = tp->fackets_out; now = tcp_jiffies32; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); @@ -3636,13 +3628,13 @@ void __init tcp_init(void) max_wshare = min(4UL*1024*1024, limit); max_rshare = min(6UL*1024*1024, limit); - sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; - sysctl_tcp_wmem[1] = 16*1024; - sysctl_tcp_wmem[2] = max(64*1024, max_wshare); + init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; + init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); - sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; - sysctl_tcp_rmem[1] = 87380; - sysctl_tcp_rmem[2] = max(87380, max_rshare); + init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_rmem[1] = 87380; + init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare); pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 2f26124fd160..bc6c02f16243 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -33,9 +33,11 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name) } /* Must be called with rcu lock held */ -static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name) +static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net, + const char *name) { - const struct tcp_congestion_ops *ca = tcp_ca_find(name); + struct tcp_congestion_ops *ca = tcp_ca_find(name); + #ifdef CONFIG_MODULES if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); @@ -115,7 +117,7 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); -u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) +u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; u32 key = TCP_CA_UNSPEC; @@ -123,7 +125,7 @@ u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) might_sleep(); rcu_read_lock(); - ca = __tcp_ca_find_autoload(name); + ca = tcp_ca_find_autoload(net, name); if (ca) { key = ca->key; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; @@ -153,23 +155,18 @@ EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key); /* Assign choice of congestion control. */ void tcp_assign_congestion_control(struct sock *sk) { + struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_congestion_ops *ca; + const struct tcp_congestion_ops *ca; rcu_read_lock(); - list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (likely(try_module_get(ca->owner))) { - icsk->icsk_ca_ops = ca; - goto out; - } - /* Fallback to next available. The last really - * guaranteed fallback is Reno from this list. - */ - } -out: + ca = rcu_dereference(net->ipv4.tcp_congestion_control); + if (unlikely(!try_module_get(ca->owner))) + ca = &tcp_reno; + icsk->icsk_ca_ops = ca; rcu_read_unlock(); - memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else @@ -214,29 +211,27 @@ void tcp_cleanup_congestion_control(struct sock *sk) } /* Used by sysctl to change default congestion control */ -int tcp_set_default_congestion_control(const char *name) +int tcp_set_default_congestion_control(struct net *net, const char *name) { struct tcp_congestion_ops *ca; - int ret = -ENOENT; - - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); -#ifdef CONFIG_MODULES - if (!ca && capable(CAP_NET_ADMIN)) { - spin_unlock(&tcp_cong_list_lock); + const struct tcp_congestion_ops *prev; + int ret; - request_module("tcp_%s", name); - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); - } -#endif + rcu_read_lock(); + ca = tcp_ca_find_autoload(net, name); + if (!ca) { + ret = -ENOENT; + } else if (!try_module_get(ca->owner)) { + ret = -EBUSY; + } else { + prev = xchg(&net->ipv4.tcp_congestion_control, ca); + if (prev) + module_put(prev->owner); - if (ca) { - ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */ - list_move(&ca->list, &tcp_cong_list); + ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; } - spin_unlock(&tcp_cong_list_lock); + rcu_read_unlock(); return ret; } @@ -244,7 +239,8 @@ int tcp_set_default_congestion_control(const char *name) /* Set default value from kernel configuration at bootup */ static int __init tcp_congestion_default(void) { - return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG); + return tcp_set_default_congestion_control(&init_net, + CONFIG_DEFAULT_TCP_CONG); } late_initcall(tcp_congestion_default); @@ -264,14 +260,12 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) } /* Get current default congestion control */ -void tcp_get_default_congestion_control(char *name) +void tcp_get_default_congestion_control(struct net *net, char *name) { - struct tcp_congestion_ops *ca; - /* We will always have reno... */ - BUG_ON(list_empty(&tcp_cong_list)); + const struct tcp_congestion_ops *ca; rcu_read_lock(); - ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); + ca = rcu_dereference(net->ipv4.tcp_congestion_control); strncpy(name, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); } @@ -351,12 +345,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo if (!load) ca = tcp_ca_find(name); else - ca = __tcp_ca_find_autoload(name); + ca = tcp_ca_find_autoload(sock_net(sk), name); + /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { icsk->icsk_ca_setsockopt = 1; goto out; } + if (!ca) { err = -ENOENT; } else if (!load) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0ada8bfc2ebd..734cfc8ff76e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -100,7 +100,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) -#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) +#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK) #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) @@ -320,7 +320,7 @@ static void tcp_sndbuf_expand(struct sock *sk) sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) - sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); + sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -354,7 +354,7 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; - int window = tcp_win_from_space(sk, sysctl_tcp_rmem[2]) >> 1; + int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -409,7 +409,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) rcvmem <<= 2; if (sk->sk_rcvbuf < rcvmem) - sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); + sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); } /* 4. Try to fixup all. It is made immediately after connection enters @@ -457,15 +457,16 @@ static void tcp_clamp_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct net *net = sock_net(sk); icsk->icsk_ack.quick = 0; - if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - sysctl_tcp_rmem[2]); + net->ipv4.sysctl_tcp_rmem[2]); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); @@ -623,7 +624,8 @@ void tcp_rcv_space_adjust(struct sock *sk) while (tcp_win_from_space(sk, rcvmem) < tp->advmss) rcvmem += 128; - rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); + rcvbuf = min(rcvwin / tp->advmss * rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (rcvbuf > sk->sk_rcvbuf) { sk->sk_rcvbuf = rcvbuf; @@ -793,12 +795,12 @@ static void tcp_update_pacing_rate(struct sock *sk) if (likely(tp->srtt_us)) do_div(rate, tp->srtt_us); - /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate + /* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate * without any lock. We want to make sure compiler wont store * intermediate values in this location. */ - ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate, - sk->sk_max_pacing_rate); + WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate, + sk->sk_max_pacing_rate)); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -840,18 +842,6 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } -/* - * Packet counting of FACK is based on in-order assumptions, therefore TCP - * disables it when reordering is detected - */ -void tcp_disable_fack(struct tcp_sock *tp) -{ - /* RFC3517 uses different metric in lost marker => reset on change */ - if (tcp_is_fack(tp)) - tp->lost_skb_hint = NULL; - tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED; -} - /* Take a notice that peer is sending D-SACKs */ static void tcp_dsack_seen(struct tcp_sock *tp) { @@ -859,42 +849,39 @@ static void tcp_dsack_seen(struct tcp_sock *tp) tp->rack.dsack_seen = 1; } -static void tcp_update_reordering(struct sock *sk, const int metric, - const int ts) +/* It's reordering when higher sequence was delivered (i.e. sacked) before + * some lower never-retransmitted sequence ("low_seq"). The maximum reordering + * distance is approximated in full-mss packet distance ("reordering"). + */ +static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, + const int ts) { struct tcp_sock *tp = tcp_sk(sk); - int mib_idx; + const u32 mss = tp->mss_cache; + u32 fack, metric; - if (WARN_ON_ONCE(metric < 0)) + fack = tcp_highest_sack_seq(tp); + if (!before(low_seq, fack)) return; - if (metric > tp->reordering) { - tp->reordering = min(sock_net(sk)->ipv4.sysctl_tcp_max_reordering, metric); - + metric = fack - low_seq; + if ((metric > tp->reordering * mss) && mss) { #if FASTRETRANS_DEBUG > 1 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, tp->reordering, - tp->fackets_out, + 0, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif - tcp_disable_fack(tp); + tp->reordering = min_t(u32, (metric + mss - 1) / mss, + sock_net(sk)->ipv4.sysctl_tcp_max_reordering); } tp->rack.reord = 1; - /* This exciting event is worth to be remembered. 8) */ - if (ts) - mib_idx = LINUX_MIB_TCPTSREORDER; - else if (tcp_is_reno(tp)) - mib_idx = LINUX_MIB_TCPRENOREORDER; - else if (tcp_is_fack(tp)) - mib_idx = LINUX_MIB_TCPFACKREORDER; - else - mib_idx = LINUX_MIB_TCPSACKREORDER; - - NET_INC_STATS(sock_net(sk), mib_idx); + NET_INC_STATS(sock_net(sk), + ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); } /* This must be called before lost_out is incremented */ @@ -968,7 +955,6 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) * 3. Loss detection event of two flavors: * A. Scoreboard estimator decided the packet is lost. * A'. Reno "three dupacks" marks head of queue lost. - * A''. Its FACK modification, head until snd.fack is lost. * B. SACK arrives sacking SND.NXT at the moment, when the * segment was retransmitted. * 4. D-SACK added new rule: D-SACK changes any tag to S. @@ -1111,8 +1097,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } struct tcp_sacktag_state { - int reord; - int fack_count; + u32 reord; /* Timestamps for earliest and latest never-retransmitted segment * that was SACKed. RTO needs the earliest RTT to stay conservative, * but congestion control should still get an accurate delay signal. @@ -1188,15 +1173,15 @@ static u8 tcp_sacktag_one(struct sock *sk, u64 xmit_time) { struct tcp_sock *tp = tcp_sk(sk); - int fack_count = state->fack_count; /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) tp->undo_retrans--; - if (sacked & TCPCB_SACKED_ACKED) - state->reord = min(fack_count, state->reord); + if ((sacked & TCPCB_SACKED_ACKED) && + before(start_seq, state->reord)) + state->reord = start_seq; } /* Nothing to do; acked frame is about to be dropped (was ACKed). */ @@ -1222,9 +1207,10 @@ static u8 tcp_sacktag_one(struct sock *sk, * which was in hole. It is reordering. */ if (before(start_seq, - tcp_highest_sack_seq(tp))) - state->reord = min(fack_count, - state->reord); + tcp_highest_sack_seq(tp)) && + before(start_seq, state->reord)) + state->reord = start_seq; + if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; if (state->first_sackt == 0) @@ -1243,15 +1229,10 @@ static u8 tcp_sacktag_one(struct sock *sk, tp->sacked_out += pcount; tp->delivered += pcount; /* Out-of-order packets delivered */ - fack_count += pcount; - /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ - if (!tcp_is_fack(tp) && tp->lost_skb_hint && + if (tp->lost_skb_hint && before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) tp->lost_cnt_hint += pcount; - - if (fack_count > tp->fackets_out) - tp->fackets_out = fack_count; } /* D-SACK. We can detect redundant retransmission in S|R and plain R @@ -1498,7 +1479,6 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, } out: - state->fack_count += pcount; return prev; noop: @@ -1577,8 +1557,6 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, tcp_highest_sack_seq(tp))) tcp_advance_highest_sack(sk, skb); } - - state->fack_count += tcp_skb_pcount(skb); } return skb; } @@ -1589,7 +1567,6 @@ static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, { struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; struct sk_buff *skb; - int unack_bytes; while (*p) { parent = *p; @@ -1602,12 +1579,6 @@ static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, p = &parent->rb_right; continue; } - - state->fack_count = 0; - unack_bytes = TCP_SKB_CB(skb)->seq - tcp_sk(sk)->snd_una; - if (state->mss_now && unack_bytes > 0) - state->fack_count = unack_bytes / state->mss_now; - return skb; } return NULL; @@ -1665,13 +1636,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, int first_sack_index; state->flag = 0; - state->reord = tp->packets_out; + state->reord = tp->snd_nxt; - if (!tp->sacked_out) { - if (WARN_ON(tp->fackets_out)) - tp->fackets_out = 0; + if (!tp->sacked_out) tcp_highest_sack_reset(sk); - } found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); @@ -1743,7 +1711,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } state->mss_now = tcp_current_mss(sk); - state->fack_count = 0; skb = NULL; i = 0; @@ -1801,7 +1768,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state->fack_count = tp->fackets_out; cache++; goto walk; } @@ -1816,7 +1782,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state->fack_count = tp->fackets_out; } skb = tcp_sacktag_skip(skb, sk, state, start_seq); @@ -1836,9 +1801,8 @@ advance_sp: for (j = 0; j < used_sacks; j++) tp->recv_sack_cache[i++] = sp[j]; - if ((state->reord < tp->fackets_out) && - ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) - tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); + if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker) + tcp_check_sack_reordering(sk, state->reord, 0); tcp_verify_left_out(tp); out: @@ -1876,8 +1840,13 @@ static bool tcp_limit_reno_sacked(struct tcp_sock *tp) static void tcp_check_reno_reordering(struct sock *sk, const int addend) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_limit_reno_sacked(tp)) - tcp_update_reordering(sk, tp->packets_out + addend, 0); + + if (!tcp_limit_reno_sacked(tp)) + return; + + tp->reordering = min_t(u32, tp->packets_out + addend, + sock_net(sk)->ipv4.sysctl_tcp_max_reordering); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); } /* Emulate SACKs for SACKless connection: account for a new dupack. */ @@ -1923,7 +1892,6 @@ void tcp_clear_retrans(struct tcp_sock *tp) tp->lost_out = 0; tp->undo_marker = 0; tp->undo_retrans = -1; - tp->fackets_out = 0; tp->sacked_out = 0; } @@ -1973,7 +1941,6 @@ void tcp_enter_loss(struct sock *sk) if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; - tp->fackets_out = 0; } tcp_clear_all_retrans_hints(tp); @@ -2040,19 +2007,10 @@ static bool tcp_check_sack_reneging(struct sock *sk, int flag) return false; } -static inline int tcp_fackets_out(const struct tcp_sock *tp) -{ - return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; -} - /* Heurestics to calculate number of duplicate ACKs. There's no dupACKs * counter when SACK is enabled (without SACK, sacked_out is used for * that purpose). * - * Instead, with FACK TCP uses fackets_out that includes both SACKed - * segments up to the highest received SACK block so far and holes in - * between them. - * * With reordering, holes may still be in flight, so RFC3517 recovery * uses pure sacked_out (total number of SACKed segments) even though * it violates the RFC that uses duplicate ACKs, often these are equal @@ -2062,10 +2020,10 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp) */ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) { - return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; + return tp->sacked_out + 1; } -/* Linux NewReno/SACK/FACK/ECN state machine. +/* Linux NewReno/SACK/ECN state machine. * -------------------------------------- * * "Open" Normal state, no dubious events, fast path. @@ -2130,16 +2088,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) * dynamically measured and adjusted. This is implemented in * tcp_rack_mark_lost. * - * FACK (Disabled by default. Subsumbed by RACK): - * It is the simplest heuristics. As soon as we decided - * that something is lost, we decide that _all_ not SACKed - * packets until the most forward SACK are lost. I.e. - * lost_out = fackets_out - sacked_out and left_out = fackets_out. - * It is absolutely correct estimate, if network does not reorder - * packets. And it loses any connection to reality when reordering - * takes place. We use FACK by default until reordering - * is suspected on the path to this destination. - * * If the receiver does not support SACK: * * NewReno (RFC6582): in Recovery we assume that one segment @@ -2188,7 +2136,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) } /* Detect loss in event "A" above by marking head of queue up as lost. - * For FACK or non-SACK(Reno) senders, the first "packets" number of segments + * For non-SACK(Reno) senders, the first "packets" number of segments * are considered lost. For RFC3517 SACK, a segment is considered lost if it * has at least tp->reordering SACKed seqments above it; "packets" refers to * the maximum SACKed segments to pass before reaching this limit. @@ -2224,12 +2172,12 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) break; oldcnt = cnt; - if (tcp_is_fack(tp) || tcp_is_reno(tp) || + if (tcp_is_reno(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) cnt += tcp_skb_pcount(skb); if (cnt > packets) { - if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || + if (tcp_is_sack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || (oldcnt >= packets)) break; @@ -2260,11 +2208,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) if (tcp_is_reno(tp)) { tcp_mark_head_lost(sk, 1, 1); - } else if (tcp_is_fack(tp)) { - int lost = tp->fackets_out - tp->reordering; - if (lost <= 0) - lost = 1; - tcp_mark_head_lost(sk, lost, 0); } else { int sacked_upto = tp->sacked_out - tp->reordering; if (sacked_upto >= 0) @@ -2611,7 +2554,6 @@ void tcp_simple_retransmit(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int mss = tcp_current_mss(sk); - u32 prior_lost = tp->lost_out; skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { if (tcp_skb_seglen(skb) > mss && @@ -2626,7 +2568,7 @@ void tcp_simple_retransmit(struct sock *sk) tcp_clear_retrans_hints_partial(tp); - if (prior_lost == tp->lost_out) + if (!tp->lost_out) return; if (tcp_is_reno(tp)) @@ -2734,15 +2676,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, } /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, const int acked) +static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && tcp_packet_delayed(tp)) { /* Plain luck! Hole if filled with delayed - * packet, rather than with a retransmit. + * packet, rather than with a retransmit. Check reordering. */ - tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); + tcp_check_sack_reordering(sk, prior_snd_una, 1); /* We are getting evidence that the reordering degree is higher * than we realized. If there are no retransmits out then we @@ -2778,6 +2720,14 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) } } +static bool tcp_force_fast_retransmit(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return after(tcp_highest_sack_seq(tp), + tp->snd_una + tp->reordering * tp->mss_cache); +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2790,19 +2740,17 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ -static void tcp_fastretrans_alert(struct sock *sk, const int acked, +static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, bool is_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int fast_rexmit = 0, flag = *ack_flag; bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && - (tcp_fackets_out(tp) > tp->reordering)); + tcp_force_fast_retransmit(sk)); if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; - if (!tp->sacked_out && tp->fackets_out) - tp->fackets_out = 0; /* Now state machine starts. * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ @@ -2849,11 +2797,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else { - if (tcp_try_undo_partial(sk, acked)) + if (tcp_try_undo_partial(sk, prior_snd_una)) return; /* Partial ACK arrived. Force fast retransmit. */ do_lost = tcp_is_reno(tp) || - tcp_fackets_out(tp) > tp->reordering; + tcp_force_fast_retransmit(sk); } if (tcp_try_undo_dsack(sk)) { tcp_try_keep_open(sk); @@ -3016,7 +2964,7 @@ void tcp_rearm_rto(struct sock *sk) /* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */ static void tcp_set_xmit_timer(struct sock *sk) { - if (!tcp_schedule_loss_probe(sk)) + if (!tcp_schedule_loss_probe(sk, true)) tcp_rearm_rto(sk); } @@ -3063,15 +3011,15 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ -static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, int *acked, +static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, + u32 prior_snd_una, struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); u64 first_ackt, last_ackt; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; - u32 reord = tp->packets_out; + u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */ struct sk_buff *skb, *next; bool fully_acked = true; long sack_rtt_us = -1L; @@ -3086,6 +3034,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + const u32 start_seq = scb->seq; u8 sacked = scb->sacked; u32 acked_pcount; @@ -3116,7 +3065,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, first_ackt = last_ackt; last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; - reord = min(pkts_acked, reord); + if (before(start_seq, reord)) + reord = start_seq; if (!after(scb->end_seq, tp->high_seq)) flag |= FLAG_ORIG_SACK_ACKED; } @@ -3194,16 +3144,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, int delta; /* Non-retransmitted hole got filled? That's reordering */ - if (reord < prior_fackets && reord <= tp->fackets_out) - tcp_update_reordering(sk, tp->fackets_out - reord, 0); + if (before(reord, prior_fack)) + tcp_check_sack_reordering(sk, reord, 0); - delta = tcp_is_fack(tp) ? pkts_acked : - prior_sacked - tp->sacked_out; + delta = prior_sacked - tp->sacked_out; tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); } - - tp->fackets_out -= min(pkts_acked, tp->fackets_out); - } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent @@ -3244,7 +3190,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } } #endif - *acked = pkts_acked; return flag; } @@ -3553,12 +3498,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; - u32 prior_fackets; int prior_packets = tp->packets_out; u32 delivered = tp->delivered; u32 lost = tp->lost; - int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 prior_fack; sack_state.first_sackt = 0; sack_state.rate = &rs; @@ -3590,7 +3534,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) icsk->icsk_retransmits = 0; } - prior_fackets = tp->fackets_out; + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet @@ -3646,8 +3590,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, - &sack_state); + flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); tcp_rack_update_reo_wnd(sk, &rs); @@ -3659,7 +3602,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) @@ -3675,7 +3619,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3697,7 +3642,8 @@ old_ack: if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); tcp_xmit_recovery(sk, rexmit); } @@ -5706,9 +5652,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_fack) - tcp_enable_fack(tp); - tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -6187,7 +6130,6 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, if (req) { struct inet_request_sock *ireq = inet_rsk(req); - kmemcheck_annotate_bitfield(ireq, flags); ireq->ireq_opt = NULL; #if IS_ENABLED(CONFIG_IPV6) ireq->pktopts = NULL; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0162c577bb9c..c6bc0c4d19c6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2409,8 +2409,8 @@ struct proto tcp_prot = { .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, - .sysctl_wmem = sysctl_tcp_wmem, - .sysctl_rmem = sysctl_tcp_rmem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, @@ -2430,6 +2430,8 @@ static void __net_exit tcp_sk_exit(struct net *net) { int cpu; + module_put(net->ipv4.tcp_congestion_control->owner); + for_each_possible_cpu(cpu) inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); free_percpu(net->ipv4.tcp_sk); @@ -2509,12 +2511,26 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; - + if (net != &init_net) { + memcpy(net->ipv4.sysctl_tcp_rmem, + init_net.ipv4.sysctl_tcp_rmem, + sizeof(init_net.ipv4.sysctl_tcp_rmem)); + memcpy(net->ipv4.sysctl_tcp_wmem, + init_net.ipv4.sysctl_tcp_wmem, + sizeof(init_net.ipv4.sysctl_tcp_wmem)); + } net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; atomic_set(&net->ipv4.tfo_active_disable_times, 0); + /* Reno is always built in */ + if (!net_eq(net, &init_net) && + try_module_get(init_net.ipv4.tcp_congestion_control->owner)) + net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; + else + net->ipv4.tcp_congestion_control = &tcp_reno; + return 0; fail: tcp_sk_exit(net); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 9d5ddebfd831..7097f92d16e5 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -470,10 +470,8 @@ void tcp_init_metrics(struct sock *sk) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; } val = tcp_metric_get(tm, TCP_METRIC_REORDERING); - if (val && tp->reordering != val) { - tcp_disable_fack(tp); + if (val && tp->reordering != val) tp->reordering = val; - } crtt = tcp_metric_get(tm, TCP_METRIC_RTT); rcu_read_unlock(); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4bb86580decd..e36eff0403f4 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -475,7 +475,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->packets_out = 0; newtp->retrans_out = 0; newtp->sacked_out = 0; - newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; newtp->tlp_high_seq = 0; newtp->lsndtime = tcp_jiffies32; @@ -509,10 +508,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; - if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { - if (sock_net(sk)->ipv4.sysctl_tcp_fack) - tcp_enable_fack(newtp); - } + newtp->rx_opt.sack_ok = ireq->sack_ok; newtp->window_clamp = req->rsk_window_clamp; newtp->rcv_ssthresh = req->rsk_rcv_wnd; newtp->rcv_wnd = req->rsk_rcv_wnd; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 11f69bbf9307..b6a2aa1dcf56 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -149,11 +149,19 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, * is freed by GSO engine */ if (copy_destructor) { + int delta; + swap(gso_skb->sk, skb->sk); swap(gso_skb->destructor, skb->destructor); sum_truesize += skb->truesize; - refcount_add(sum_truesize - gso_skb->truesize, - &skb->sk->sk_wmem_alloc); + delta = sum_truesize - gso_skb->truesize; + /* In some pathological cases, delta can be negative. + * We need to either use refcount_add() or refcount_sub_and_test() + */ + if (likely(delta >= 0)) + refcount_add(delta, &skb->sk->sk_wmem_alloc); + else + WARN_ON_ONCE(refcount_sub_and_test(-delta, &skb->sk->sk_wmem_alloc)); } delta = htonl(oldlen + (skb_tail_pointer(skb) - diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a9d917e4dad5..a4d214c7b506 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -220,7 +220,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, (*rcv_wscale) = 0; if (wscale_ok) { /* Set window scaling on max possible window */ - space = max_t(u32, space, sysctl_tcp_rmem[2]); + space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { @@ -1218,21 +1218,6 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) } } -/* When a modification to fackets out becomes necessary, we need to check - * skb is counted to fackets_out or not. - */ -static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb, - int decr) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (!tp->sacked_out || tcp_is_reno(tp)) - return; - - if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq)) - tp->fackets_out -= decr; -} - /* Pcount in the middle of the write queue got changed, we need to do various * tweaks to fix counters */ @@ -1253,11 +1238,9 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de if (tcp_is_reno(tp) && decr > 0) tp->sacked_out -= min_t(u32, tp->sacked_out, decr); - tcp_adjust_fackets_out(sk, skb, decr); - if (tp->lost_skb_hint && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && - (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))) + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) tp->lost_cnt_hint -= decr; tcp_verify_left_out(tp); @@ -1737,7 +1720,7 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, { u32 bytes, segs; - bytes = min(sk->sk_pacing_rate >> 10, + bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift, sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); /* Goal is to send at least one packet per ms, @@ -1978,7 +1961,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; - win_divisor = ACCESS_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor); + win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor); if (win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); @@ -2215,7 +2198,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, { unsigned int limit; - limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); + limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift); limit = min_t(u32, limit, sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; @@ -2408,7 +2391,7 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) - tcp_schedule_loss_probe(sk); + tcp_schedule_loss_probe(sk, false); is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); tcp_cwnd_validate(sk, is_cwnd_limited); return false; @@ -2416,7 +2399,7 @@ repair: return !tp->packets_out && !tcp_write_queue_empty(sk); } -bool tcp_schedule_loss_probe(struct sock *sk) +bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2457,7 +2440,9 @@ bool tcp_schedule_loss_probe(struct sock *sk) } /* If the RTO formula yields an earlier time, then use that time. */ - rto_delta_us = tcp_rto_delta_us(sk); /* How far in future is RTO? */ + rto_delta_us = advancing_rto ? + jiffies_to_usecs(inet_csk(sk)->icsk_rto) : + tcp_rto_delta_us(sk); /* How far in future is RTO? */ if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); @@ -2961,9 +2946,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) * retransmitted data is acknowledged. It tries to continue * resending the rest of the retransmit queue, until either * we've sent it all or the congestion window limit is reached. - * If doing SACK, the first ACK which comes back for a timeout - * based retransmit packet might feed us FACK information again. - * If so, we use it to avoid unnecessarily retransmissions. */ void tcp_xmit_retransmit_queue(struct sock *sk) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a6699af05539..e4ff25c947c5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1852,7 +1852,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ /* if we're overly short, let UDP handle it */ - encap_rcv = ACCESS_ONCE(up->encap_rcv); + encap_rcv = READ_ONCE(up->encap_rcv); if (encap_rcv) { int ret; @@ -2297,7 +2297,7 @@ void udp_destroy_sock(struct sock *sk) unlock_sock_fast(sk, slow); if (static_key_false(&udp_encap_needed) && up->encap_type) { void (*encap_destroy)(struct sock *sk); - encap_destroy = ACCESS_ONCE(up->encap_destroy); + encap_destroy = READ_ONCE(up->encap_destroy); if (encap_destroy) encap_destroy(sk); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 66d8c3d912fd..a0ae1c9d37df 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -231,7 +231,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .proxy_ndp = 0, .accept_source_route = 0, /* we do not accept RH0 by default. */ .disable_ipv6 = 0, - .accept_dad = 1, + .accept_dad = 0, .suppress_frag_ndisc = 1, .accept_ra_mtu = 1, .stable_secret = { @@ -1267,7 +1267,9 @@ out: in6_ifa_put(ifp); } -static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *ift) +static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, + struct inet6_ifaddr *ift, + bool block) { struct inet6_dev *idev = ifp->idev; struct in6_addr addr, *tmpaddr; @@ -1371,7 +1373,7 @@ retry: ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen, ipv6_addr_scope(&addr), addr_flags, - tmp_valid_lft, tmp_prefered_lft, true, NULL); + tmp_valid_lft, tmp_prefered_lft, block, NULL); if (IS_ERR(ift)) { in6_ifa_put(ifp); in6_dev_put(idev); @@ -1956,7 +1958,7 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) if (ifpub) { in6_ifa_hold(ifpub); spin_unlock_bh(&ifp->lock); - ipv6_create_tempaddr(ifpub, ifp); + ipv6_create_tempaddr(ifpub, ifp, true); in6_ifa_put(ifpub); } else { spin_unlock_bh(&ifp->lock); @@ -2456,7 +2458,7 @@ static void manage_tempaddrs(struct inet6_dev *idev, * no temporary address currently exists. */ read_unlock_bh(&idev->lock); - ipv6_create_tempaddr(ifp, NULL); + ipv6_create_tempaddr(ifp, NULL, false); } else { read_unlock_bh(&idev->lock); } @@ -4351,7 +4353,7 @@ restart: spin_lock(&ifpub->lock); ifpub->regen_count = 0; spin_unlock(&ifpub->lock); - ipv6_create_tempaddr(ifpub, ifp); + ipv6_create_tempaddr(ifpub, ifp, true); in6_ifa_put(ifpub); in6_ifa_put(ifp); goto restart; @@ -5057,6 +5059,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad; array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode; array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy; + array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass; } static inline size_t inet6_ifla6_size(void) @@ -5984,6 +5987,7 @@ int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, } static int minus_one = -1; +static const int zero = 0; static const int one = 1; static const int two_five_five = 255; @@ -6355,6 +6359,15 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = addrconf_sysctl_disable_policy, }, { + .procname = "ndisc_tclass", + .data = &ipv6_devconf.ndisc_tclass, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&zero, + .extra2 = (void *)&two_five_five, + }, + { /* sentinel */ } }; diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 37bb33fbc742..78c974391567 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -444,7 +444,7 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) if (err == -EINPROGRESS) goto out; - if (err == -EBUSY) + if (err == -ENOSPC) err = NET_XMIT_DROP; goto out_free; } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 4000b71bfdc5..a902ff8f59be 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -396,7 +396,7 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info case -EINPROGRESS: goto error; - case -EBUSY: + case -ENOSPC: err = NET_XMIT_DROP; break; diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index e0170f62bc39..3c7a11b62334 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -55,17 +55,6 @@ struct ila_identifier { }; }; -enum { - ILA_ATYPE_IID = 0, - ILA_ATYPE_LUID, - ILA_ATYPE_VIRT_V4, - ILA_ATYPE_VIRT_UNI_V6, - ILA_ATYPE_VIRT_MULTI_V6, - ILA_ATYPE_RSVD_1, - ILA_ATYPE_RSVD_2, - ILA_ATYPE_RSVD_3, -}; - #define CSUM_NEUTRAL_FLAG htonl(0x10000000) struct ila_addr { @@ -93,6 +82,7 @@ struct ila_params { struct ila_locator locator_match; __wsum csum_diff; u8 csum_mode; + u8 ident_type; }; static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to) diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index aba0998ddbfb..8c88ecf29b93 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -13,30 +13,37 @@ #include <uapi/linux/ila.h> #include "ila.h" -static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) +void ila_init_saved_csum(struct ila_params *p) { - struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); + if (!p->locator_match.v64) + return; + p->csum_diff = compute_csum_diff8( + (__be32 *)&p->locator, + (__be32 *)&p->locator_match); +} + +static __wsum get_csum_diff_iaddr(struct ila_addr *iaddr, struct ila_params *p) +{ if (p->locator_match.v64) return p->csum_diff; else - return compute_csum_diff8((__be32 *)&iaddr->loc, - (__be32 *)&p->locator); + return compute_csum_diff8((__be32 *)&p->locator, + (__be32 *)&iaddr->loc); } -static void ila_csum_do_neutral(struct ila_addr *iaddr, - struct ila_params *p) +static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) +{ + return get_csum_diff_iaddr(ila_a2i(&ip6h->daddr), p); +} + +static void ila_csum_do_neutral_fmt(struct ila_addr *iaddr, + struct ila_params *p) { __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3]; __wsum diff, fval; - /* Check if checksum adjust value has been cached */ - if (p->locator_match.v64) { - diff = p->csum_diff; - } else { - diff = compute_csum_diff8((__be32 *)&p->locator, - (__be32 *)iaddr); - } + diff = get_csum_diff_iaddr(iaddr, p); fval = (__force __wsum)(ila_csum_neutral_set(iaddr->ident) ? CSUM_NEUTRAL_FLAG : ~CSUM_NEUTRAL_FLAG); @@ -53,13 +60,23 @@ static void ila_csum_do_neutral(struct ila_addr *iaddr, iaddr->ident.csum_neutral ^= 1; } -static void ila_csum_adjust_transport(struct sk_buff *skb, +static void ila_csum_do_neutral_nofmt(struct ila_addr *iaddr, struct ila_params *p) { + __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3]; __wsum diff; - struct ipv6hdr *ip6h = ipv6_hdr(skb); - struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); + + diff = get_csum_diff_iaddr(iaddr, p); + + *adjust = ~csum_fold(csum_add(diff, csum_unfold(*adjust))); +} + +static void ila_csum_adjust_transport(struct sk_buff *skb, + struct ila_params *p) +{ size_t nhoff = sizeof(struct ipv6hdr); + struct ipv6hdr *ip6h = ipv6_hdr(skb); + __wsum diff; switch (ip6h->nexthdr) { case NEXTHDR_TCP: @@ -98,52 +115,45 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, } break; } - - /* Now change destination address */ - iaddr->loc = p->locator; } void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, - bool set_csum_neutral) + bool sir2ila) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); - /* First deal with the transport checksum */ - if (ila_csum_neutral_set(iaddr->ident)) { - /* C-bit is set in the locator indicating that this - * is a locator being translated to a SIR address. - * Perform (receiver) checksum-neutral translation. - */ - if (!set_csum_neutral) - ila_csum_do_neutral(iaddr, p); - } else { - switch (p->csum_mode) { - case ILA_CSUM_ADJUST_TRANSPORT: - ila_csum_adjust_transport(skb, p); - break; - case ILA_CSUM_NEUTRAL_MAP: - ila_csum_do_neutral(iaddr, p); - break; - case ILA_CSUM_NO_ACTION: + switch (p->csum_mode) { + case ILA_CSUM_ADJUST_TRANSPORT: + ila_csum_adjust_transport(skb, p); + break; + case ILA_CSUM_NEUTRAL_MAP: + if (sir2ila) { + if (WARN_ON(ila_csum_neutral_set(iaddr->ident))) { + /* Checksum flag should never be + * set in a formatted SIR address. + */ + break; + } + } else if (!ila_csum_neutral_set(iaddr->ident)) { + /* ILA to SIR translation and C-bit isn't + * set so we're good. + */ break; } + ila_csum_do_neutral_fmt(iaddr, p); + break; + case ILA_CSUM_NEUTRAL_MAP_AUTO: + ila_csum_do_neutral_nofmt(iaddr, p); + break; + case ILA_CSUM_NO_ACTION: + break; } /* Now change destination address */ iaddr->loc = p->locator; } -void ila_init_saved_csum(struct ila_params *p) -{ - if (!p->locator_match.v64) - return; - - p->csum_diff = compute_csum_diff8( - (__be32 *)&p->locator, - (__be32 *)&p->locator_match); -} - static int __init ila_init(void) { int ret; diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 696281b4bca2..3d56a2fb6f86 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -20,6 +20,7 @@ struct ila_lwt { struct ila_params p; struct dst_cache dst_cache; u32 connected : 1; + u32 lwt_output : 1; }; static inline struct ila_lwt *ila_lwt_lwtunnel( @@ -45,8 +46,10 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ila_update_ipv6_locator(skb, ila_params_lwtunnel(orig_dst->lwtstate), - true); + if (ilwt->lwt_output) + ila_update_ipv6_locator(skb, + ila_params_lwtunnel(orig_dst->lwtstate), + true); if (rt->rt6i_flags & (RTF_GATEWAY | RTF_CACHE)) { /* Already have a next hop address in route, no need for @@ -98,11 +101,15 @@ drop: static int ila_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); + struct ila_lwt *ilwt = ila_lwt_lwtunnel(dst->lwtstate); if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), false); + if (!ilwt->lwt_output) + ila_update_ipv6_locator(skb, + ila_params_lwtunnel(dst->lwtstate), + false); return dst->lwtstate->orig_input(skb); @@ -114,6 +121,8 @@ drop: static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, + [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, + [ILA_ATTR_HOOK_TYPE] = { .type = NLA_U8, }, }; static int ila_build_state(struct nlattr *nla, @@ -127,33 +136,84 @@ static int ila_build_state(struct nlattr *nla, struct lwtunnel_state *newts; const struct fib6_config *cfg6 = cfg; struct ila_addr *iaddr; + u8 ident_type = ILA_ATYPE_USE_FORMAT; + u8 hook_type = ILA_HOOK_ROUTE_OUTPUT; + u8 csum_mode = ILA_CSUM_NO_ACTION; + bool lwt_output = true; + u8 eff_ident_type; int ret; if (family != AF_INET6) return -EINVAL; - if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) { - /* Need to have full locator and at least type field - * included in destination - */ + ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack); + if (ret < 0) + return ret; + + if (!tb[ILA_ATTR_LOCATOR]) return -EINVAL; - } iaddr = (struct ila_addr *)&cfg6->fc_dst; - if (!ila_addr_is_ila(iaddr) || ila_csum_neutral_set(iaddr->ident)) { - /* Don't allow translation for a non-ILA address or checksum - * neutral flag to be set. + if (tb[ILA_ATTR_IDENT_TYPE]) + ident_type = nla_get_u8(tb[ILA_ATTR_IDENT_TYPE]); + + if (ident_type == ILA_ATYPE_USE_FORMAT) { + /* Infer identifier type from type field in formatted + * identifier. */ + + if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) { + /* Need to have full locator and at least type field + * included in destination + */ + return -EINVAL; + } + + eff_ident_type = iaddr->ident.type; + } else { + eff_ident_type = ident_type; + } + + switch (eff_ident_type) { + case ILA_ATYPE_IID: + /* Don't allow ILA for IID type */ + return -EINVAL; + case ILA_ATYPE_LUID: + break; + case ILA_ATYPE_VIRT_V4: + case ILA_ATYPE_VIRT_UNI_V6: + case ILA_ATYPE_VIRT_MULTI_V6: + case ILA_ATYPE_NONLOCAL_ADDR: + /* These ILA formats are not supported yet. */ + default: return -EINVAL; } - ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack); - if (ret < 0) - return ret; + if (tb[ILA_ATTR_HOOK_TYPE]) + hook_type = nla_get_u8(tb[ILA_ATTR_HOOK_TYPE]); + + switch (hook_type) { + case ILA_HOOK_ROUTE_OUTPUT: + lwt_output = true; + break; + case ILA_HOOK_ROUTE_INPUT: + lwt_output = false; + break; + default: + return -EINVAL; + } - if (!tb[ILA_ATTR_LOCATOR]) + if (tb[ILA_ATTR_CSUM_MODE]) + csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); + + if (csum_mode == ILA_CSUM_NEUTRAL_MAP && + ila_csum_neutral_set(iaddr->ident)) { + /* Don't allow translation if checksum neutral bit is + * configured and it's set in the SIR address. + */ return -EINVAL; + } newts = lwtunnel_state_alloc(sizeof(*ilwt)); if (!newts) @@ -166,19 +226,18 @@ static int ila_build_state(struct nlattr *nla, return ret; } + ilwt->lwt_output = !!lwt_output; + p = ila_params_lwtunnel(newts); + p->csum_mode = csum_mode; + p->ident_type = ident_type; p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); /* Precompute checksum difference for translation since we * know both the old locator and the new one. */ p->locator_match = iaddr->loc; - p->csum_diff = compute_csum_diff8( - (__be32 *)&p->locator_match, (__be32 *)&p->locator); - - if (tb[ILA_ATTR_CSUM_MODE]) - p->csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); ila_init_saved_csum(p); @@ -203,13 +262,23 @@ static int ila_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct ila_params *p = ila_params_lwtunnel(lwtstate); + struct ila_lwt *ilwt = ila_lwt_lwtunnel(lwtstate); if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator.v64, ILA_ATTR_PAD)) goto nla_put_failure; + if (nla_put_u8(skb, ILA_ATTR_CSUM_MODE, (__force u8)p->csum_mode)) goto nla_put_failure; + if (nla_put_u8(skb, ILA_ATTR_IDENT_TYPE, (__force u8)p->ident_type)) + goto nla_put_failure; + + if (nla_put_u8(skb, ILA_ATTR_HOOK_TYPE, + ilwt->lwt_output ? ILA_HOOK_ROUTE_OUTPUT : + ILA_HOOK_ROUTE_INPUT)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -220,6 +289,8 @@ static int ila_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(sizeof(u64)) + /* ILA_ATTR_LOCATOR */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_CSUM_MODE */ + nla_total_size(sizeof(u8)) + /* ILA_ATTR_IDENT_TYPE */ + nla_total_size(sizeof(u8)) + /* ILA_ATTR_HOOK_TYPE */ 0; } diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 3123b9de91b5..6eb5e68f112a 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -121,6 +121,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, + [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, }; static int parse_nl_config(struct genl_info *info, @@ -138,6 +139,14 @@ static int parse_nl_config(struct genl_info *info, if (info->attrs[ILA_ATTR_CSUM_MODE]) xp->ip.csum_mode = nla_get_u8(info->attrs[ILA_ATTR_CSUM_MODE]); + else + xp->ip.csum_mode = ILA_CSUM_NO_ACTION; + + if (info->attrs[ILA_ATTR_IDENT_TYPE]) + xp->ip.ident_type = nla_get_u8( + info->attrs[ILA_ATTR_IDENT_TYPE]); + else + xp->ip.ident_type = ILA_ATYPE_USE_FORMAT; if (info->attrs[ILA_ATTR_IFINDEX]) xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); @@ -198,7 +207,7 @@ static void ila_free_cb(void *ptr, void *arg) } } -static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral); +static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila); static unsigned int ila_nf_input(void *priv, @@ -396,7 +405,8 @@ static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) (__force u64)ila->xp.ip.locator_match.v64, ILA_ATTR_PAD) || nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) || - nla_put_u32(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode)) + nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode) || + nla_put_u8(msg, ILA_ATTR_IDENT_TYPE, ila->xp.ip.ident_type)) return -1; return 0; @@ -607,7 +617,7 @@ static struct pernet_operations ila_net_ops = { .size = sizeof(struct ila_net), }; -static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral) +static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) { struct ila_map *ila; struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -617,16 +627,16 @@ static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral) /* Assumes skb contains a valid IPv6 header that is pulled */ - if (!ila_addr_is_ila(iaddr)) { - /* Type indicates this is not an ILA address */ - return 0; - } + /* No check here that ILA type in the mapping matches what is in the + * address. We assume that whatever sender gaves us can be translated. + * The checksum mode however is relevant. + */ rcu_read_lock(); ila = ila_lookup_wildcards(iaddr, skb->dev->ifindex, ilan); if (ila) - ila_update_ipv6_locator(skb, &ila->xp.ip, set_csum_neutral); + ila_update_ipv6_locator(skb, &ila->xp.ip, sir2ila); rcu_read_unlock(); diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 15535ee327c5..9f2e73c71768 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -47,7 +47,7 @@ static atomic_t fl_size = ATOMIC_INIT(0); static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(unsigned long dummy); -static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc, 0, 0); +static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc); /* FL hash table lock: it protects only of GC */ diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3e10c51e7e0c..4cfd8e0696fe 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -369,6 +369,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { + struct net *net = dev_net(skb->dev); const struct gre_base_hdr *greh; const struct ipv6hdr *ipv6h; int grehlen = sizeof(*greh); @@ -402,9 +403,8 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; switch (type) { - __u32 teli; struct ipv6_tlv_tnl_enc_lim *tel; - __u32 mtu; + __u32 teli; case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); @@ -435,12 +435,11 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } return; case ICMPV6_PKT_TOOBIG: - mtu = be32_to_cpu(info) - offset - t->tun_hlen; - if (t->dev->type == ARPHRD_ETHER) - mtu -= ETH_HLEN; - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - t->dev->mtu = mtu; + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); + return; + case NDISC_REDIRECT: + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); return; } @@ -461,7 +460,7 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) &ipv6h->saddr, &ipv6h->daddr, tpi->key, tpi->proto); if (tunnel) { - ip6_tnl_rcv(tunnel, skb, tpi, NULL, false); + ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); return PACKET_RCVD; } @@ -503,7 +502,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, __u32 *pmtu, __be16 proto) { struct ip6_tnl *tunnel = netdev_priv(dev); - struct dst_entry *dst = skb_dst(skb); __be16 protocol; if (dev->type == ARPHRD_ETHER) @@ -522,10 +520,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); - /* TooBig packet may have updated dst->dev's mtu */ - if (dst && dst_mtu(dst) > dst->dev->mtu) - dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu); - return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, NEXTHDR_GRE); } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 439d65f7e094..3d3092adf1d2 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -471,15 +471,16 @@ static int ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, u8 *type, u8 *code, int *msg, __u32 *info, int offset) { - const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) skb->data; - struct ip6_tnl *t; - int rel_msg = 0; + const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data; + struct net *net = dev_net(skb->dev); u8 rel_type = ICMPV6_DEST_UNREACH; u8 rel_code = ICMPV6_ADDR_UNREACH; - u8 tproto; __u32 rel_info = 0; - __u16 len; + struct ip6_tnl *t; int err = -ENOENT; + int rel_msg = 0; + u8 tproto; + __u16 len; /* If the packet doesn't contain the original IPv6 header we are in trouble since we might need the source address for further @@ -490,16 +491,15 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, if (!t) goto out; - tproto = ACCESS_ONCE(t->parms.proto); + tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto out; err = 0; switch (*type) { - __u32 teli; struct ipv6_tlv_tnl_enc_lim *tel; - __u32 mtu; + __u32 mtu, teli; case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); @@ -530,11 +530,11 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, } break; case ICMPV6_PKT_TOOBIG: + ip6_update_pmtu(skb, net, htonl(*info), 0, 0, + sock_net_uid(net, NULL)); mtu = *info - offset; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - t->dev->mtu = mtu; - len = sizeof(*ipv6h) + ntohs(ipv6h->payload_len); if (len > mtu) { rel_type = ICMPV6_PKT_TOOBIG; @@ -543,6 +543,10 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, rel_msg = 1; } break; + case NDISC_REDIRECT: + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); + break; } *type = rel_type; @@ -559,13 +563,12 @@ static int ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { - int rel_msg = 0; - u8 rel_type = type; - u8 rel_code = code; __u32 rel_info = ntohl(info); - int err; - struct sk_buff *skb2; const struct iphdr *eiph; + struct sk_buff *skb2; + int err, rel_msg = 0; + u8 rel_type = type; + u8 rel_code = code; struct rtable *rt; struct flowi4 fl4; @@ -590,10 +593,6 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_FRAG_NEEDED; break; - case NDISC_REDIRECT: - rel_type = ICMP_REDIRECT; - rel_code = ICMP_REDIR_HOST; - /* fall through */ default: return 0; } @@ -612,33 +611,26 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, eiph = ip_hdr(skb2); /* Try to guess incoming interface */ - rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, - eiph->saddr, 0, - 0, 0, - IPPROTO_IPIP, RT_TOS(eiph->tos), 0); + rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, + 0, 0, 0, IPPROTO_IPIP, RT_TOS(eiph->tos), 0); if (IS_ERR(rt)) goto out; skb2->dev = rt->dst.dev; + ip_rt_put(rt); /* route "incoming" packet */ if (rt->rt_flags & RTCF_LOCAL) { - ip_rt_put(rt); - rt = NULL; rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, - eiph->daddr, eiph->saddr, - 0, 0, - IPPROTO_IPIP, - RT_TOS(eiph->tos), 0); - if (IS_ERR(rt) || - rt->dst.dev->type != ARPHRD_TUNNEL) { + eiph->daddr, eiph->saddr, 0, 0, + IPPROTO_IPIP, RT_TOS(eiph->tos), 0); + if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL) { if (!IS_ERR(rt)) ip_rt_put(rt); goto out; } skb_dst_set(skb2, &rt->dst); } else { - ip_rt_put(rt); if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || skb_dst(skb2)->dev->type != ARPHRD_TUNNEL) @@ -650,10 +642,9 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (rel_info > dst_mtu(skb_dst(skb2))) goto out; - skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), NULL, skb2, rel_info); + skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), NULL, skb2, + rel_info); } - if (rel_type == ICMP_REDIRECT) - skb_dst(skb2)->ops->redirect(skb_dst(skb2), NULL, skb2); icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); @@ -666,11 +657,10 @@ static int ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { - int rel_msg = 0; + __u32 rel_info = ntohl(info); + int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; - __u32 rel_info = ntohl(info); - int err; err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); @@ -901,7 +891,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr); if (t) { - u8 tproto = ACCESS_ONCE(t->parms.proto); + u8 tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto drop; @@ -1236,7 +1226,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - tproto = ACCESS_ONCE(t->parms.proto); + tproto = READ_ONCE(t->parms.proto); if (tproto != IPPROTO_IPIP && tproto != 0) return -1; @@ -1306,7 +1296,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; - tproto = ACCESS_ONCE(t->parms.proto); + tproto = READ_ONCE(t->parms.proto); if ((tproto != IPPROTO_IPV6 && tproto != 0) || ip6_tnl_addr_conflict(t, ipv6h)) return -1; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 59fad81e5f7a..9c24b85949c1 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1617,6 +1617,10 @@ int ip6mr_sk_done(struct sock *sk) struct net *net = sock_net(sk); struct mr6_table *mrt; + if (sk->sk_type != SOCK_RAW || + inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return err; + rtnl_lock(); ip6mr_for_each_table(mrt, net) { if (sk == mrt->mroute6_sk) { diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index f9c3ffe04382..b3cea200c85e 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -427,12 +427,19 @@ static void ip6_nd_hdr(struct sk_buff *skb, int hop_limit, int len) { struct ipv6hdr *hdr; + struct inet6_dev *idev; + unsigned tclass; + + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); + tclass = idev ? idev->cnf.ndisc_tclass : 0; + rcu_read_unlock(); skb_push(skb, sizeof(*hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); - ip6_flow_hdr(hdr, 0, 0); + ip6_flow_hdr(hdr, tclass, 0); hdr->payload_len = htons(len); hdr->nexthdr = IPPROTO_ICMPV6; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 01bd3ee5ebc6..f06e25065a34 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -800,6 +800,25 @@ get_counters(const struct xt_table_info *t, } } +static void get_old_counters(const struct xt_table_info *t, + struct xt_counters counters[]) +{ + struct ip6t_entry *iter; + unsigned int cpu, i; + + for_each_possible_cpu(cpu) { + i = 0; + xt_entry_foreach(iter, t->entries, t->size) { + const struct xt_counters *tmp; + + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); + ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); + ++i; + } + cond_resched(); + } +} + static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; @@ -1090,8 +1109,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters, and synchronize with replace */ - get_counters(oldinfo, counters); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index fe01dc953c56..3b80a38f62b8 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -339,7 +339,7 @@ static void ipv6_hooks_unregister(struct net *net) mutex_unlock(®ister_ipv6_hooks); } -struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { +const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { .l3proto = PF_INET6, .pkt_to_tuple = ipv6_pkt_to_tuple, .invert_tuple = ipv6_invert_tuple, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index a9e1fd1a8536..3ac0d826afc4 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -94,7 +94,6 @@ static int icmpv6_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeout) { /* Do not immediately delete the connection after the first @@ -176,6 +175,12 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, return NF_ACCEPT; } +static void icmpv6_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMPV6, "%s", msg); +} + static int icmpv6_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, @@ -187,17 +192,13 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl, icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmp6h == NULL) { - if (LOG_INVALID(net, IPPROTO_ICMPV6)) - nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL, - "nf_ct_icmpv6: short packet "); + icmpv6_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { - if (LOG_INVALID(net, IPPROTO_ICMPV6)) - nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL, - "nf_ct_icmpv6: ICMPv6 checksum failed "); + icmpv6_error_log(skb, net, pf, "ICMPv6 checksum failed"); return -NF_ACCEPT; } @@ -258,9 +259,14 @@ static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], return 0; } -static int icmpv6_nlattr_tuple_size(void) +static unsigned int icmpv6_nlattr_tuple_size(void) { - return nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); + + return size; } #endif diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index a338bbc33cf3..4a7e5ffa5108 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -31,37 +31,6 @@ static u32 __ipv6_select_ident(struct net *net, u32 hashrnd, return id; } -/* This function exists only for tap drivers that must support broken - * clients requesting UFO without specifying an IPv6 fragment ID. - * - * This is similar to ipv6_select_ident() but we use an independent hash - * seed to limit information leakage. - * - * The network header must be set before calling this. - */ -void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) -{ - static u32 ip6_proxy_idents_hashrnd __read_mostly; - struct in6_addr buf[2]; - struct in6_addr *addrs; - u32 id; - - addrs = skb_header_pointer(skb, - skb_network_offset(skb) + - offsetof(struct ipv6hdr, saddr), - sizeof(buf), buf); - if (!addrs) - return; - - net_get_random_once(&ip6_proxy_idents_hashrnd, - sizeof(ip6_proxy_idents_hashrnd)); - - id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd, - &addrs[1], &addrs[0]); - skb_shinfo(skb)->ip6_frag_id = htonl(id); -} -EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); - __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 70d9659fc1e9..05eb7bc36156 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2378,6 +2378,7 @@ out: static int ip6_convert_metrics(struct mx6_config *mxc, const struct fib6_config *cfg) { + struct net *net = cfg->fc_nlinfo.nl_net; bool ecn_ca = false; struct nlattr *nla; int remaining; @@ -2403,7 +2404,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc, char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) goto err; } else { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0e2529958b52..6bb98c93edfe 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1940,8 +1940,8 @@ struct proto tcpv6_prot = { .memory_pressure = &tcp_memory_pressure, .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, - .sysctl_wmem = sysctl_tcp_wmem, - .sysctl_rmem = sysctl_tcp_rmem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 40d7234c27b9..3f30fa313bf2 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -606,7 +606,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ /* if we're overly short, let UDP handle it */ - encap_rcv = ACCESS_ONCE(up->encap_rcv); + encap_rcv = READ_ONCE(up->encap_rcv); if (encap_rcv) { int ret; @@ -1432,7 +1432,7 @@ void udpv6_destroy_sock(struct sock *sk) if (static_key_false(&udpv6_encap_needed) && up->encap_type) { void (*encap_destroy)(struct sock *sk); - encap_destroy = ACCESS_ONCE(up->encap_destroy); + encap_destroy = READ_ONCE(up->encap_destroy); if (encap_destroy) encap_destroy(sk); } diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 4e438bc7ee87..f85f0d7480ac 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -338,6 +338,14 @@ static int __net_init xfrm6_tunnel_net_init(struct net *net) static void __net_exit xfrm6_tunnel_net_exit(struct net *net) { + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); + unsigned int i; + + for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) + WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i])); + + for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) + WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byspi[i])); } static struct pernet_operations xfrm6_tunnel_net_ops = { diff --git a/net/key/af_key.c b/net/key/af_key.c index a00d607e7224..3dffb892d52c 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3845,7 +3845,7 @@ static void __net_exit pfkey_net_exit(struct net *net) struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); pfkey_exit_proc(net); - BUG_ON(!hlist_empty(&net_pfkey->table)); + WARN_ON(!hlist_empty(&net_pfkey->table)); } static struct pernet_operations pfkey_net_ops = { diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 7c8d1eb757a5..115918ad8eca 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1246,8 +1246,6 @@ static void l2tp_tunnel_destruct(struct sock *sk) list_del_rcu(&tunnel->list); spin_unlock_bh(&pn->l2tp_tunnel_list_lock); - l2tp_tunnel_closeall(tunnel); - tunnel->sock = NULL; l2tp_tunnel_dec_refcount(tunnel); @@ -1835,6 +1833,7 @@ static __net_exit void l2tp_exit_net(struct net *net) { struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_tunnel *tunnel = NULL; + int hash; rcu_read_lock_bh(); list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { @@ -1844,6 +1843,9 @@ static __net_exit void l2tp_exit_net(struct net *net) flush_workqueue(l2tp_wq); rcu_barrier(); + + for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) + WARN_ON_ONCE(!hlist_empty(&pn->l2tp_session_hlist[hash])); } static struct pernet_operations l2tp_net_ops = { diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 3e2dec1fb0f5..5c366ecfa1cb 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -41,7 +41,6 @@ /* via netdev_priv() */ struct l2tp_eth { - struct sock *tunnel_sock; struct l2tp_session *session; atomic_long_t tx_bytes; atomic_long_t tx_packets; @@ -313,7 +312,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, priv = netdev_priv(dev); priv->session = session; - priv->tunnel_sock = tunnel->sock; session->recv_skb = l2tp_eth_dev_recv; session->session_close = l2tp_eth_delete; #if IS_ENABLED(CONFIG_L2TP_DEBUGFS) diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 6dbe450400a2..ff61124fdf59 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -123,6 +123,7 @@ static int l2tp_ip_recv(struct sk_buff *skb) unsigned char *ptr, *optr; struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; + struct iphdr *iph; int length; if (!pskb_may_pull(skb, 4)) @@ -178,24 +179,17 @@ pass_up: goto discard; tunnel_id = ntohl(*(__be32 *) &skb->data[4]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel) { - sk = tunnel->sock; - sock_hold(sk); - } else { - struct iphdr *iph = (struct iphdr *) skb_network_header(skb); - - read_lock_bh(&l2tp_ip_lock); - sk = __l2tp_ip_bind_lookup(net, iph->daddr, iph->saddr, - inet_iif(skb), tunnel_id); - if (!sk) { - read_unlock_bh(&l2tp_ip_lock); - goto discard; - } + iph = (struct iphdr *)skb_network_header(skb); - sock_hold(sk); + read_lock_bh(&l2tp_ip_lock); + sk = __l2tp_ip_bind_lookup(net, iph->daddr, iph->saddr, inet_iif(skb), + tunnel_id); + if (!sk) { read_unlock_bh(&l2tp_ip_lock); + goto discard; } + sock_hold(sk); + read_unlock_bh(&l2tp_ip_lock); if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 59ebb6e4f735..192344688c06 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -136,6 +136,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb) unsigned char *ptr, *optr; struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; + struct ipv6hdr *iph; int length; if (!pskb_may_pull(skb, 4)) @@ -192,24 +193,17 @@ pass_up: goto discard; tunnel_id = ntohl(*(__be32 *) &skb->data[4]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel) { - sk = tunnel->sock; - sock_hold(sk); - } else { - struct ipv6hdr *iph = ipv6_hdr(skb); - - read_lock_bh(&l2tp_ip6_lock); - sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, &iph->saddr, - inet6_iif(skb), tunnel_id); - if (!sk) { - read_unlock_bh(&l2tp_ip6_lock); - goto discard; - } + iph = ipv6_hdr(skb); - sock_hold(sk); + read_lock_bh(&l2tp_ip6_lock); + sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, &iph->saddr, + inet6_iif(skb), tunnel_id); + if (!sk) { read_unlock_bh(&l2tp_ip6_lock); + goto discard; } + sock_hold(sk); + read_unlock_bh(&l2tp_ip6_lock); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 5f5c78b632d0..b412fc3351dc 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -127,8 +127,6 @@ struct pppol2tp_session { * PPPoX socket */ struct sock *__sk; /* Copy of .sk, for cleanup */ struct rcu_head rcu; /* For asynchronous release */ - struct sock *tunnel_sock; /* Pointer to the tunnel UDP - * socket */ int flags; /* accessed by PPPIOCGFLAGS. * Unused. */ }; @@ -295,7 +293,6 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, int error; struct l2tp_session *session; struct l2tp_tunnel *tunnel; - struct pppol2tp_session *ps; int uhlen; error = -ENOTCONN; @@ -308,10 +305,7 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, if (session == NULL) goto error; - ps = l2tp_session_priv(session); - tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock); - if (tunnel == NULL) - goto error_put_sess; + tunnel = session->tunnel; uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0; @@ -322,7 +316,7 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, 2 + total_len, /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */ 0, GFP_KERNEL); if (!skb) - goto error_put_sess_tun; + goto error_put_sess; /* Reserve space for headers. */ skb_reserve(skb, NET_SKB_PAD); @@ -340,20 +334,17 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, error = memcpy_from_msg(skb_put(skb, total_len), m, total_len); if (error < 0) { kfree_skb(skb); - goto error_put_sess_tun; + goto error_put_sess; } local_bh_disable(); l2tp_xmit_skb(session, skb, session->hdr_len); local_bh_enable(); - sock_put(ps->tunnel_sock); sock_put(sk); return total_len; -error_put_sess_tun: - sock_put(ps->tunnel_sock); error_put_sess: sock_put(sk); error: @@ -377,10 +368,8 @@ error: static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) { struct sock *sk = (struct sock *) chan->private; - struct sock *sk_tun; struct l2tp_session *session; struct l2tp_tunnel *tunnel; - struct pppol2tp_session *ps; int uhlen, headroom; if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) @@ -391,13 +380,7 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) if (session == NULL) goto abort; - ps = l2tp_session_priv(session); - sk_tun = ps->tunnel_sock; - if (sk_tun == NULL) - goto abort_put_sess; - tunnel = l2tp_sock_to_tunnel(sk_tun); - if (tunnel == NULL) - goto abort_put_sess; + tunnel = session->tunnel; uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0; headroom = NET_SKB_PAD + @@ -406,7 +389,7 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) session->hdr_len + /* L2TP header */ 2; /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */ if (skb_cow_head(skb, headroom)) - goto abort_put_sess_tun; + goto abort_put_sess; /* Setup PPP header */ __skb_push(skb, 2); @@ -417,12 +400,10 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) l2tp_xmit_skb(session, skb, session->hdr_len); local_bh_enable(); - sock_put(sk_tun); sock_put(sk); + return 1; -abort_put_sess_tun: - sock_put(sk_tun); abort_put_sess: sock_put(sk); abort: @@ -609,7 +590,6 @@ static void pppol2tp_session_init(struct l2tp_session *session) ps = l2tp_session_priv(session); mutex_init(&ps->sk_lock); - ps->tunnel_sock = session->tunnel->sock; ps->owner = current->pid; /* If PMTU discovery was enabled, use the MTU that was discovered */ @@ -760,13 +740,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, error = -EEXIST; goto end; } - - /* consistency checks */ - if (ps->tunnel_sock != tunnel->sock) { - mutex_unlock(&ps->sk_lock); - error = -EEXIST; - goto end; - } } else { /* Default MTU must allow space for UDP/L2TP/PPP headers */ cfg.mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD; @@ -919,9 +892,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, goto end; pls = l2tp_session_priv(session); - tunnel = l2tp_sock_to_tunnel(pls->tunnel_sock); - if (tunnel == NULL) - goto end_put_sess; + tunnel = session->tunnel; inet = inet_sk(tunnel->sock); if ((tunnel->version == 2) && (tunnel->sock->sk_family == AF_INET)) { @@ -1001,8 +972,6 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, *usockaddr_len = len; error = 0; - sock_put(pls->tunnel_sock); -end_put_sess: sock_put(sk); end: return error; @@ -1241,7 +1210,6 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, struct sock *sk = sock->sk; struct l2tp_session *session; struct l2tp_tunnel *tunnel; - struct pppol2tp_session *ps; int err; if (!sk) @@ -1265,16 +1233,10 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, /* Special case: if session's session_id is zero, treat ioctl as a * tunnel ioctl */ - ps = l2tp_session_priv(session); if ((session->session_id == 0) && (session->peer_session_id == 0)) { - err = -EBADF; - tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock); - if (tunnel == NULL) - goto end_put_sess; - + tunnel = session->tunnel; err = pppol2tp_tunnel_ioctl(tunnel, cmd, arg); - sock_put(ps->tunnel_sock); goto end_put_sess; } @@ -1400,7 +1362,6 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, struct sock *sk = sock->sk; struct l2tp_session *session; struct l2tp_tunnel *tunnel; - struct pppol2tp_session *ps; int val; int err; @@ -1425,20 +1386,14 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, /* Special case: if session_id == 0x0000, treat as operation on tunnel */ - ps = l2tp_session_priv(session); if ((session->session_id == 0) && (session->peer_session_id == 0)) { - err = -EBADF; - tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock); - if (tunnel == NULL) - goto end_put_sess; - + tunnel = session->tunnel; err = pppol2tp_tunnel_setsockopt(sk, tunnel, optname, val); - sock_put(ps->tunnel_sock); - } else + } else { err = pppol2tp_session_setsockopt(sk, session, optname, val); + } -end_put_sess: sock_put(sk); end: return err; @@ -1526,7 +1481,6 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname, struct l2tp_tunnel *tunnel; int val, len; int err; - struct pppol2tp_session *ps; if (level != SOL_PPPOL2TP) return -EINVAL; @@ -1550,16 +1504,10 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname, goto end; /* Special case: if session_id == 0x0000, treat as operation on tunnel */ - ps = l2tp_session_priv(session); if ((session->session_id == 0) && (session->peer_session_id == 0)) { - err = -EBADF; - tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock); - if (tunnel == NULL) - goto end_put_sess; - + tunnel = session->tunnel; err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val); - sock_put(ps->tunnel_sock); if (err) goto end_put_sess; } else { diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index dd3e83328ad5..82cb93f66b9b 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -193,7 +193,7 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, */ rcv = rcu_dereference(sap->rcv_func); dest = llc_pdu_type(skb); - sap_handler = dest ? ACCESS_ONCE(llc_type_handlers[dest - 1]) : NULL; + sap_handler = dest ? READ_ONCE(llc_type_handlers[dest - 1]) : NULL; if (unlikely(!sap_handler)) { if (rcv) rcv(skb, dev, pt, orig_dev); @@ -214,7 +214,7 @@ drop: kfree_skb(skb); goto out; handle_station: - sta_handler = ACCESS_ONCE(llc_station_handler); + sta_handler = READ_ONCE(llc_station_handler); if (!sta_handler) goto drop; sta_handler(skb); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 9673e157bf8f..a3060e55122c 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2017,7 +2017,7 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate, static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) { - u16 rate = ACCESS_ONCE(sta_get_last_rx_stats(sta)->last_rate); + u16 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate); if (rate == STA_STATS_RATE_INVALID) return -EINVAL; diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index f135938bf781..67e708e98ccf 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -73,6 +73,9 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, ncm->data[2] = data; ncm->data[4] = ntohl(lsc->oem_status); + netdev_info(ndp->ndev.dev, "NCSI: LSC AEN - channel %u state %s\n", + nc->id, data & 0x1 ? "up" : "down"); + chained = !list_empty(&nc->link); state = nc->state; spin_unlock_irqrestore(&nc->lock, flags); @@ -145,6 +148,8 @@ static int ncsi_aen_handler_hncdsc(struct ncsi_dev_priv *ndp, ncm = &nc->modes[NCSI_MODE_LINK]; hncdsc = (struct ncsi_aen_hncdsc_pkt *)h; ncm->data[3] = ntohl(hncdsc->status); + netdev_info(ndp->ndev.dev, "NCSI: HNCDSC AEN - channel %u state %s\n", + nc->id, ncm->data[3] & 0x3 ? "up" : "down"); if (!list_empty(&nc->link) || nc->state != NCSI_CHANNEL_ACTIVE) { spin_unlock_irqrestore(&nc->lock, flags); @@ -212,10 +217,18 @@ int ncsi_aen_handler(struct ncsi_dev_priv *ndp, struct sk_buff *skb) } ret = ncsi_validate_aen_pkt(h, nah->payload); - if (ret) + if (ret) { + netdev_warn(ndp->ndev.dev, + "NCSI: 'bad' packet ignored for AEN type 0x%x\n", + h->type); goto out; + } ret = nah->handler(ndp, h); + if (ret) + netdev_err(ndp->ndev.dev, + "NCSI: Handler for AEN type 0x%x returned %d\n", + h->type, ret); out: consume_skb(skb); return ret; diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 47baf914eec2..a2b904a718c6 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -229,6 +229,8 @@ static void ncsi_channel_monitor(unsigned long data) case NCSI_CHANNEL_MONITOR_WAIT ... NCSI_CHANNEL_MONITOR_WAIT_MAX: break; default: + netdev_err(ndp->ndev.dev, "NCSI Channel %d timed out!\n", + nc->id); if (!(ndp->flags & NCSI_DEV_HWA)) { ncsi_report_link(ndp, true); ndp->flags |= NCSI_DEV_RESHUFFLE; @@ -682,7 +684,7 @@ static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, data = ncsi_get_filter(nc, NCSI_FILTER_VLAN, index); if (!data) { netdev_err(ndp->ndev.dev, - "ncsi: failed to retrieve filter %d\n", index); + "NCSI: failed to retrieve filter %d\n", index); /* Set the VLAN id to 0 - this will still disable the entry in * the filter table, but we won't know what it was. */ @@ -692,7 +694,7 @@ static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, } netdev_printk(KERN_DEBUG, ndp->ndev.dev, - "ncsi: removed vlan tag %u at index %d\n", + "NCSI: removed vlan tag %u at index %d\n", vid, index + 1); ncsi_remove_filter(nc, NCSI_FILTER_VLAN, index); @@ -718,7 +720,7 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, if (index < 0) { /* New tag to add */ netdev_printk(KERN_DEBUG, ndp->ndev.dev, - "ncsi: new vlan id to set: %u\n", + "NCSI: new vlan id to set: %u\n", vlan->vid); break; } @@ -745,7 +747,7 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, } netdev_printk(KERN_DEBUG, ndp->ndev.dev, - "ncsi: set vid %u in packet, index %u\n", + "NCSI: set vid %u in packet, index %u\n", vlan->vid, index + 1); nca->type = NCSI_PKT_CMD_SVF; nca->words[1] = vlan->vid; @@ -784,8 +786,11 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) nca.package = np->id; nca.channel = NCSI_RESERVED_CHANNEL; ret = ncsi_xmit_cmd(&nca); - if (ret) + if (ret) { + netdev_err(ndp->ndev.dev, + "NCSI: Failed to transmit CMD_SP\n"); goto error; + } nd->state = ncsi_dev_state_config_cis; break; @@ -797,8 +802,11 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) nca.package = np->id; nca.channel = nc->id; ret = ncsi_xmit_cmd(&nca); - if (ret) + if (ret) { + netdev_err(ndp->ndev.dev, + "NCSI: Failed to transmit CMD_CIS\n"); goto error; + } nd->state = ncsi_dev_state_config_clear_vids; break; @@ -895,10 +903,16 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) } ret = ncsi_xmit_cmd(&nca); - if (ret) + if (ret) { + netdev_err(ndp->ndev.dev, + "NCSI: Failed to transmit CMD %x\n", + nca.type); goto error; + } break; case ncsi_dev_state_config_done: + netdev_printk(KERN_DEBUG, ndp->ndev.dev, + "NCSI: channel %u config done\n", nc->id); spin_lock_irqsave(&nc->lock, flags); if (nc->reconfigure_needed) { /* This channel's configuration has been updated @@ -925,6 +939,9 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) } else { hot_nc = NULL; nc->state = NCSI_CHANNEL_INACTIVE; + netdev_warn(ndp->ndev.dev, + "NCSI: channel %u link down after config\n", + nc->id); } spin_unlock_irqrestore(&nc->lock, flags); @@ -937,8 +954,8 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) ncsi_process_next_channel(ndp); break; default: - netdev_warn(dev, "Wrong NCSI state 0x%x in config\n", - nd->state); + netdev_alert(dev, "Wrong NCSI state 0x%x in config\n", + nd->state); } return; @@ -990,10 +1007,17 @@ static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) } if (!found) { + netdev_warn(ndp->ndev.dev, + "NCSI: No channel found with link\n"); ncsi_report_link(ndp, true); return -ENODEV; } + ncm = &found->modes[NCSI_MODE_LINK]; + netdev_printk(KERN_DEBUG, ndp->ndev.dev, + "NCSI: Channel %u added to queue (link %s)\n", + found->id, ncm->data[2] & 0x1 ? "up" : "down"); + out: spin_lock_irqsave(&ndp->lock, flags); list_add_tail_rcu(&found->link, &ndp->channel_queue); @@ -1055,6 +1079,8 @@ static int ncsi_enable_hwa(struct ncsi_dev_priv *ndp) /* We can have no channels in extremely case */ if (list_empty(&ndp->channel_queue)) { + netdev_err(ndp->ndev.dev, + "NCSI: No available channels for HWA\n"); ncsi_report_link(ndp, false); return -ENOENT; } @@ -1223,6 +1249,9 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) return; error: + netdev_err(ndp->ndev.dev, + "NCSI: Failed to transmit cmd 0x%x during probe\n", + nca.type); ncsi_report_link(ndp, true); } @@ -1276,10 +1305,14 @@ int ncsi_process_next_channel(struct ncsi_dev_priv *ndp) switch (old_state) { case NCSI_CHANNEL_INACTIVE: ndp->ndev.state = ncsi_dev_state_config; + netdev_info(ndp->ndev.dev, "NCSI: configuring channel %u\n", + nc->id); ncsi_configure_channel(ndp); break; case NCSI_CHANNEL_ACTIVE: ndp->ndev.state = ncsi_dev_state_suspend; + netdev_info(ndp->ndev.dev, "NCSI: suspending channel %u\n", + nc->id); ncsi_suspend_channel(ndp); break; default: @@ -1299,6 +1332,8 @@ out: return ncsi_choose_active_channel(ndp); } + netdev_printk(KERN_DEBUG, ndp->ndev.dev, + "NCSI: No more channels to process\n"); ncsi_report_link(ndp, false); return -ENODEV; } @@ -1390,7 +1425,7 @@ static int ncsi_kick_channels(struct ncsi_dev_priv *ndp) ncsi_dev_state_config || !list_empty(&nc->link)) { netdev_printk(KERN_DEBUG, nd->dev, - "ncsi: channel %p marked dirty\n", + "NCSI: channel %p marked dirty\n", nc); nc->reconfigure_needed = true; } @@ -1410,7 +1445,7 @@ static int ncsi_kick_channels(struct ncsi_dev_priv *ndp) spin_unlock_irqrestore(&ndp->lock, flags); netdev_printk(KERN_DEBUG, nd->dev, - "ncsi: kicked channel %p\n", nc); + "NCSI: kicked channel %p\n", nc); n++; } } @@ -1431,7 +1466,7 @@ int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) nd = ncsi_find_dev(dev); if (!nd) { - netdev_warn(dev, "ncsi: No net_device?\n"); + netdev_warn(dev, "NCSI: No net_device?\n"); return 0; } @@ -1442,7 +1477,7 @@ int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) n_vids++; if (vlan->vid == vid) { netdev_printk(KERN_DEBUG, dev, - "vid %u already registered\n", vid); + "NCSI: vid %u already registered\n", vid); return 0; } } @@ -1461,7 +1496,7 @@ int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) vlan->vid = vid; list_add_rcu(&vlan->list, &ndp->vlan_vids); - netdev_printk(KERN_DEBUG, dev, "Added new vid %u\n", vid); + netdev_printk(KERN_DEBUG, dev, "NCSI: Added new vid %u\n", vid); found = ncsi_kick_channels(ndp) != 0; @@ -1481,7 +1516,7 @@ int ncsi_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) nd = ncsi_find_dev(dev); if (!nd) { - netdev_warn(dev, "ncsi: no net_device?\n"); + netdev_warn(dev, "NCSI: no net_device?\n"); return 0; } @@ -1491,14 +1526,14 @@ int ncsi_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) list_for_each_entry_safe(vlan, tmp, &ndp->vlan_vids, list) if (vlan->vid == vid) { netdev_printk(KERN_DEBUG, dev, - "vid %u found, removing\n", vid); + "NCSI: vid %u found, removing\n", vid); list_del_rcu(&vlan->list); found = true; kfree(vlan); } if (!found) { - netdev_err(dev, "ncsi: vid %u wasn't registered!\n", vid); + netdev_err(dev, "NCSI: vid %u wasn't registered!\n", vid); return -EINVAL; } @@ -1581,10 +1616,12 @@ int ncsi_start_dev(struct ncsi_dev *nd) return 0; } - if (ndp->flags & NCSI_DEV_HWA) + if (ndp->flags & NCSI_DEV_HWA) { + netdev_info(ndp->ndev.dev, "NCSI: Enabling HWA mode\n"); ret = ncsi_enable_hwa(ndp); - else + } else { ret = ncsi_choose_active_channel(ndp); + } return ret; } @@ -1615,6 +1652,7 @@ void ncsi_stop_dev(struct ncsi_dev *nd) } } + netdev_printk(KERN_DEBUG, ndp->ndev.dev, "NCSI: Stopping device\n"); ncsi_report_link(ndp, true); } EXPORT_SYMBOL_GPL(ncsi_stop_dev); diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 927dad4759d1..efd933ff5570 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -146,7 +146,7 @@ static int ncsi_rsp_handler_ec(struct ncsi_request *nr) ncm = &nc->modes[NCSI_MODE_ENABLE]; if (ncm->enable) - return -EBUSY; + return 0; ncm->enable = 1; return 0; @@ -173,7 +173,7 @@ static int ncsi_rsp_handler_dc(struct ncsi_request *nr) ncm = &nc->modes[NCSI_MODE_ENABLE]; if (!ncm->enable) - return -EBUSY; + return 0; ncm->enable = 0; return 0; @@ -217,7 +217,7 @@ static int ncsi_rsp_handler_ecnt(struct ncsi_request *nr) ncm = &nc->modes[NCSI_MODE_TX_ENABLE]; if (ncm->enable) - return -EBUSY; + return 0; ncm->enable = 1; return 0; @@ -239,7 +239,7 @@ static int ncsi_rsp_handler_dcnt(struct ncsi_request *nr) ncm = &nc->modes[NCSI_MODE_TX_ENABLE]; if (!ncm->enable) - return -EBUSY; + return 0; ncm->enable = 1; return 0; @@ -263,7 +263,7 @@ static int ncsi_rsp_handler_ae(struct ncsi_request *nr) /* Check if the AEN has been enabled */ ncm = &nc->modes[NCSI_MODE_AEN]; if (ncm->enable) - return -EBUSY; + return 0; /* Update to AEN configuration */ cmd = (struct ncsi_cmd_ae_pkt *)skb_network_header(nr->cmd); @@ -382,7 +382,7 @@ static int ncsi_rsp_handler_ev(struct ncsi_request *nr) /* Check if VLAN mode has been enabled */ ncm = &nc->modes[NCSI_MODE_VLAN]; if (ncm->enable) - return -EBUSY; + return 0; /* Update to VLAN mode */ cmd = (struct ncsi_cmd_ev_pkt *)skb_network_header(nr->cmd); @@ -409,7 +409,7 @@ static int ncsi_rsp_handler_dv(struct ncsi_request *nr) /* Check if VLAN mode has been enabled */ ncm = &nc->modes[NCSI_MODE_VLAN]; if (!ncm->enable) - return -EBUSY; + return 0; /* Update to VLAN mode */ ncm->enable = 0; @@ -455,13 +455,10 @@ static int ncsi_rsp_handler_sma(struct ncsi_request *nr) bitmap = &ncf->bitmap; if (cmd->at_e & 0x1) { - if (test_and_set_bit(cmd->index, bitmap)) - return -EBUSY; + set_bit(cmd->index, bitmap); memcpy(ncf->data + 6 * cmd->index, cmd->mac, 6); } else { - if (!test_and_clear_bit(cmd->index, bitmap)) - return -EBUSY; - + clear_bit(cmd->index, bitmap); memset(ncf->data + 6 * cmd->index, 0, 6); } @@ -485,7 +482,7 @@ static int ncsi_rsp_handler_ebf(struct ncsi_request *nr) /* Check if broadcast filter has been enabled */ ncm = &nc->modes[NCSI_MODE_BC]; if (ncm->enable) - return -EBUSY; + return 0; /* Update to broadcast filter mode */ cmd = (struct ncsi_cmd_ebf_pkt *)skb_network_header(nr->cmd); @@ -511,7 +508,7 @@ static int ncsi_rsp_handler_dbf(struct ncsi_request *nr) /* Check if broadcast filter isn't enabled */ ncm = &nc->modes[NCSI_MODE_BC]; if (!ncm->enable) - return -EBUSY; + return 0; /* Update to broadcast filter mode */ ncm->enable = 0; @@ -538,7 +535,7 @@ static int ncsi_rsp_handler_egmf(struct ncsi_request *nr) /* Check if multicast filter has been enabled */ ncm = &nc->modes[NCSI_MODE_MC]; if (ncm->enable) - return -EBUSY; + return 0; /* Update to multicast filter mode */ cmd = (struct ncsi_cmd_egmf_pkt *)skb_network_header(nr->cmd); @@ -564,7 +561,7 @@ static int ncsi_rsp_handler_dgmf(struct ncsi_request *nr) /* Check if multicast filter has been enabled */ ncm = &nc->modes[NCSI_MODE_MC]; if (!ncm->enable) - return -EBUSY; + return 0; /* Update to multicast filter mode */ ncm->enable = 0; @@ -591,7 +588,7 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr) /* Check if flow control has been enabled */ ncm = &nc->modes[NCSI_MODE_FC]; if (ncm->enable) - return -EBUSY; + return 0; /* Update to flow control mode */ cmd = (struct ncsi_cmd_snfc_pkt *)skb_network_header(nr->cmd); @@ -1032,11 +1029,19 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, if (payload < 0) payload = ntohs(hdr->length); ret = ncsi_validate_rsp_pkt(nr, payload); - if (ret) + if (ret) { + netdev_warn(ndp->ndev.dev, + "NCSI: 'bad' packet ignored for type 0x%x\n", + hdr->type); goto out; + } /* Process the packet */ ret = nrh->handler(nr); + if (ret) + netdev_err(ndp->ndev.dev, + "NCSI: Handler for packet type 0x%x returned %d\n", + hdr->type, ret); out: ncsi_free_request(nr); return ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index a2f19b9906e9..0f164e986bf1 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -434,7 +434,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { - u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index c9b4e05ad940..e864681b8dc5 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -454,7 +454,6 @@ static size_t list_set_memsize(const struct list_set *map, size_t dsize) { struct set_elem *e; - size_t memsize; u32 n = 0; rcu_read_lock(); @@ -462,9 +461,7 @@ list_set_memsize(const struct list_set *map, size_t dsize) n++; rcu_read_unlock(); - memsize = sizeof(*map) + n * dsize; - - return memsize; + return (sizeof(*map) + n * dsize); } static int diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c index 1c8a42c1056c..d5be9c25fad6 100644 --- a/net/netfilter/ipset/pfxlen.c +++ b/net/netfilter/ipset/pfxlen.c @@ -3,6 +3,141 @@ /* Prefixlen maps for fast conversions, by Jan Engelhardt. */ +#ifdef E +#undef E +#endif + +#define PREFIXES_MAP \ + E(0x00000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0x80000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xC0000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xE0000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xF0000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xF8000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFC000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFE000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFF000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFF800000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF), + #define E(a, b, c, d) \ {.ip6 = { \ htonl(a), htonl(b), \ @@ -13,135 +148,7 @@ * just use prefixlen_netmask_map[prefixlength].ip. */ const union nf_inet_addr ip_set_netmask_map[] = { - E(0x00000000, 0x00000000, 0x00000000, 0x00000000), - E(0x80000000, 0x00000000, 0x00000000, 0x00000000), - E(0xC0000000, 0x00000000, 0x00000000, 0x00000000), - E(0xE0000000, 0x00000000, 0x00000000, 0x00000000), - E(0xF0000000, 0x00000000, 0x00000000, 0x00000000), - E(0xF8000000, 0x00000000, 0x00000000, 0x00000000), - E(0xFC000000, 0x00000000, 0x00000000, 0x00000000), - E(0xFE000000, 0x00000000, 0x00000000, 0x00000000), - E(0xFF000000, 0x00000000, 0x00000000, 0x00000000), - E(0xFF800000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF), + PREFIXES_MAP }; EXPORT_SYMBOL_GPL(ip_set_netmask_map); @@ -155,135 +162,7 @@ EXPORT_SYMBOL_GPL(ip_set_netmask_map); * just use prefixlen_hostmask_map[prefixlength].ip. */ const union nf_inet_addr ip_set_hostmask_map[] = { - E(0x00000000, 0x00000000, 0x00000000, 0x00000000), - E(0x80000000, 0x00000000, 0x00000000, 0x00000000), - E(0xC0000000, 0x00000000, 0x00000000, 0x00000000), - E(0xE0000000, 0x00000000, 0x00000000, 0x00000000), - E(0xF0000000, 0x00000000, 0x00000000, 0x00000000), - E(0xF8000000, 0x00000000, 0x00000000, 0x00000000), - E(0xFC000000, 0x00000000, 0x00000000, 0x00000000), - E(0xFE000000, 0x00000000, 0x00000000, 0x00000000), - E(0xFF000000, 0x00000000, 0x00000000, 0x00000000), - E(0xFF800000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF), + PREFIXES_MAP }; EXPORT_SYMBOL_GPL(ip_set_hostmask_map); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 3d2ac71a83ec..3e053cb30070 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -104,7 +104,7 @@ static inline void ct_write_unlock_bh(unsigned int key) spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } -static void ip_vs_conn_expire(unsigned long data); +static void ip_vs_conn_expire(struct timer_list *t); /* * Returns hash value for IPVS connection entry @@ -185,7 +185,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); ret = 1; } else { - pr_err("%s(): request for already hashed, called from %pF\n", + pr_err("%s(): request for already hashed, called from %pS\n", __func__, __builtin_return_address(0)); ret = 0; } @@ -457,7 +457,7 @@ EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); static void __ip_vs_conn_put_notimer(struct ip_vs_conn *cp) { __ip_vs_conn_put(cp); - ip_vs_conn_expire((unsigned long)cp); + ip_vs_conn_expire(&cp->timer); } /* @@ -817,9 +817,9 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head) kmem_cache_free(ip_vs_conn_cachep, cp); } -static void ip_vs_conn_expire(unsigned long data) +static void ip_vs_conn_expire(struct timer_list *t) { - struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + struct ip_vs_conn *cp = from_timer(cp, t, timer); struct netns_ipvs *ipvs = cp->ipvs; /* @@ -909,7 +909,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, } INIT_HLIST_NODE(&cp->c_list); - setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); + timer_setup(&cp->timer, ip_vs_conn_expire, 0); cp->ipvs = ipvs; cp->af = p->af; cp->daf = dest_af; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 4f940d7eb2f7..fff213eacf2a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -300,7 +300,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) unsigned int hash; if (svc->flags & IP_VS_SVC_F_HASHED) { - pr_err("%s(): request for already hashed, called from %pF\n", + pr_err("%s(): request for already hashed, called from %pS\n", __func__, __builtin_return_address(0)); return 0; } @@ -334,7 +334,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) static int ip_vs_svc_unhash(struct ip_vs_service *svc) { if (!(svc->flags & IP_VS_SVC_F_HASHED)) { - pr_err("%s(): request for unhash flagged, called from %pF\n", + pr_err("%s(): request for unhash flagged, called from %pS\n", __func__, __builtin_return_address(0)); return 0; } @@ -1146,9 +1146,9 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return 0; } -static void ip_vs_dest_trash_expire(unsigned long data) +static void ip_vs_dest_trash_expire(struct timer_list *t) { - struct netns_ipvs *ipvs = (struct netns_ipvs *)data; + struct netns_ipvs *ipvs = from_timer(ipvs, t, dest_trash_timer); struct ip_vs_dest *dest, *next; unsigned long now = jiffies; @@ -2034,12 +2034,16 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) seq_puts(seq, " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); } else { + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); const struct ip_vs_service *svc = v; const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); char *sched_name = sched ? sched->name : "none"; + if (svc->ipvs != ipvs) + return 0; if (iter->table == ip_vs_svc_table) { #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) @@ -4019,8 +4023,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) INIT_LIST_HEAD(&ipvs->dest_trash); spin_lock_init(&ipvs->dest_trash_lock); - setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, - (unsigned long) ipvs); + timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0); atomic_set(&ipvs->ftpsvc_counter, 0); atomic_set(&ipvs->nullsvc_counter, 0); atomic_set(&ipvs->conn_out_counter, 0); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 457c6c193e13..489055091a9b 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -97,12 +97,12 @@ static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum, } -static void estimation_timer(unsigned long arg) +static void estimation_timer(struct timer_list *t) { struct ip_vs_estimator *e; struct ip_vs_stats *s; u64 rate; - struct netns_ipvs *ipvs = (struct netns_ipvs *)arg; + struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer); spin_lock(&ipvs->est_lock); list_for_each_entry(e, &ipvs->est_list, list) { @@ -192,7 +192,7 @@ int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs) { INIT_LIST_HEAD(&ipvs->est_list); spin_lock_init(&ipvs->est_lock); - setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)ipvs); + timer_setup(&ipvs->est_timer, estimation_timer, 0); mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index b6aa4a970c6e..d625179de485 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -106,6 +106,7 @@ struct ip_vs_lblc_table { struct rcu_head rcu_head; struct hlist_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ struct timer_list periodic_timer; /* collect stale entries */ + struct ip_vs_service *svc; /* pointer back to service */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ int rover; /* rover for expire check */ @@ -294,10 +295,10 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) * of the table. * The full expiration check is for this purpose now. */ -static void ip_vs_lblc_check_expire(unsigned long data) +static void ip_vs_lblc_check_expire(struct timer_list *t) { - struct ip_vs_service *svc = (struct ip_vs_service *) data; - struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_table *tbl = from_timer(tbl, t, periodic_timer); + struct ip_vs_service *svc = tbl->svc; unsigned long now = jiffies; int goal; int i, j; @@ -369,12 +370,12 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) tbl->rover = 0; tbl->counter = 1; tbl->dead = 0; + tbl->svc = svc; /* * Hook periodic timer for garbage collection */ - setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, - (unsigned long)svc); + timer_setup(&tbl->periodic_timer, ip_vs_lblc_check_expire, 0); mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); return 0; diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index c13ff575f9f7..84c57b62a588 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -278,6 +278,7 @@ struct ip_vs_lblcr_table { atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ struct timer_list periodic_timer; /* collect stale entries */ + struct ip_vs_service *svc; /* pointer back to service */ int rover; /* rover for expire check */ int counter; /* counter for no expire */ bool dead; @@ -458,10 +459,10 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) * of the table. * The full expiration check is for this purpose now. */ -static void ip_vs_lblcr_check_expire(unsigned long data) +static void ip_vs_lblcr_check_expire(struct timer_list *t) { - struct ip_vs_service *svc = (struct ip_vs_service *) data; - struct ip_vs_lblcr_table *tbl = svc->sched_data; + struct ip_vs_lblcr_table *tbl = from_timer(tbl, t, periodic_timer); + struct ip_vs_service *svc = tbl->svc; unsigned long now = jiffies; int goal; int i, j; @@ -532,12 +533,12 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) tbl->rover = 0; tbl->counter = 1; tbl->dead = 0; + tbl->svc = svc; /* * Hook periodic timer for garbage collection */ - setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, - (unsigned long)svc); + timer_setup(&tbl->periodic_timer, ip_vs_lblcr_check_expire, 0); mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); return 0; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 13f740875507..9ee71cb276d7 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -458,7 +458,7 @@ static inline bool in_persistence(struct ip_vs_conn *cp) static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) { - unsigned long orig = ACCESS_ONCE(cp->sync_endtime); + unsigned long orig = READ_ONCE(cp->sync_endtime); unsigned long now = jiffies; unsigned long n = (now + cp->timeout) & ~3UL; unsigned int sync_refresh_period; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 01130392b7c0..85f643c1e227 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1083,7 +1083,7 @@ static void gc_worker(struct work_struct *work) next_run = gc_work->next_gc_run; gc_work->last_bucket = i; gc_work->early_drop = false; - queue_delayed_work(system_long_wq, &gc_work->dwork, next_run); + queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); } static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) @@ -1419,7 +1419,7 @@ repeat: /* Decide what timeout policy we want to apply to this flow. */ timeouts = nf_ct_timeout_lookup(net, ct, l4proto); - ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, timeouts); + ret = l4proto->packet(ct, skb, dataoff, ctinfo, timeouts); if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ @@ -1563,9 +1563,14 @@ int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], } EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); -int nf_ct_port_nlattr_tuple_size(void) +unsigned int nf_ct_port_nlattr_tuple_size(void) { - return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); + + return size; } EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); #endif @@ -1940,7 +1945,7 @@ int nf_conntrack_hash_resize(unsigned int hashsize) return 0; } -int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) +int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) { unsigned int hashsize; int rc; @@ -2084,7 +2089,7 @@ int nf_conntrack_init_start(void) goto err_proto; conntrack_gc_work_init(&conntrack_gc_work); - queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, HZ); + queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ); return 0; diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index 89b2e46925c4..cf1bf2605c10 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -91,41 +91,41 @@ typedef struct field_t { } field_t; /* Bit Stream */ -typedef struct { +struct bitstr { unsigned char *buf; unsigned char *beg; unsigned char *end; unsigned char *cur; unsigned int bit; -} bitstr_t; +}; /* Tool Functions */ #define INC_BIT(bs) if((++(bs)->bit)>7){(bs)->cur++;(bs)->bit=0;} #define INC_BITS(bs,b) if(((bs)->bit+=(b))>7){(bs)->cur+=(bs)->bit>>3;(bs)->bit&=7;} #define BYTE_ALIGN(bs) if((bs)->bit){(bs)->cur++;(bs)->bit=0;} #define CHECK_BOUND(bs,n) if((bs)->cur+(n)>(bs)->end)return(H323_ERROR_BOUND) -static unsigned int get_len(bitstr_t *bs); -static unsigned int get_bit(bitstr_t *bs); -static unsigned int get_bits(bitstr_t *bs, unsigned int b); -static unsigned int get_bitmap(bitstr_t *bs, unsigned int b); -static unsigned int get_uint(bitstr_t *bs, int b); +static unsigned int get_len(struct bitstr *bs); +static unsigned int get_bit(struct bitstr *bs); +static unsigned int get_bits(struct bitstr *bs, unsigned int b); +static unsigned int get_bitmap(struct bitstr *bs, unsigned int b); +static unsigned int get_uint(struct bitstr *bs, int b); /* Decoder Functions */ -static int decode_nul(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_bool(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_oid(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_int(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_enum(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_bitstr(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_numstr(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_octstr(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_bmpstr(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_seq(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_seqof(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_choice(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_nul(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_bool(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_oid(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_int(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_enum(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_bitstr(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_numstr(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_octstr(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_seq(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_seqof(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_choice(struct bitstr *bs, const struct field_t *f, char *base, int level); /* Decoder Functions Vector */ -typedef int (*decoder_t)(bitstr_t *, const struct field_t *, char *, int); +typedef int (*decoder_t)(struct bitstr *, const struct field_t *, char *, int); static const decoder_t Decoders[] = { decode_nul, decode_bool, @@ -150,7 +150,7 @@ static const decoder_t Decoders[] = { * Functions ****************************************************************************/ /* Assume bs is aligned && v < 16384 */ -static unsigned int get_len(bitstr_t *bs) +static unsigned int get_len(struct bitstr *bs) { unsigned int v; @@ -166,7 +166,7 @@ static unsigned int get_len(bitstr_t *bs) } /****************************************************************************/ -static unsigned int get_bit(bitstr_t *bs) +static unsigned int get_bit(struct bitstr *bs) { unsigned int b = (*bs->cur) & (0x80 >> bs->bit); @@ -177,7 +177,7 @@ static unsigned int get_bit(bitstr_t *bs) /****************************************************************************/ /* Assume b <= 8 */ -static unsigned int get_bits(bitstr_t *bs, unsigned int b) +static unsigned int get_bits(struct bitstr *bs, unsigned int b) { unsigned int v, l; @@ -203,7 +203,7 @@ static unsigned int get_bits(bitstr_t *bs, unsigned int b) /****************************************************************************/ /* Assume b <= 32 */ -static unsigned int get_bitmap(bitstr_t *bs, unsigned int b) +static unsigned int get_bitmap(struct bitstr *bs, unsigned int b) { unsigned int v, l, shift, bytes; @@ -242,7 +242,7 @@ static unsigned int get_bitmap(bitstr_t *bs, unsigned int b) /**************************************************************************** * Assume bs is aligned and sizeof(unsigned int) == 4 ****************************************************************************/ -static unsigned int get_uint(bitstr_t *bs, int b) +static unsigned int get_uint(struct bitstr *bs, int b) { unsigned int v = 0; @@ -264,7 +264,7 @@ static unsigned int get_uint(bitstr_t *bs, int b) } /****************************************************************************/ -static int decode_nul(bitstr_t *bs, const struct field_t *f, +static int decode_nul(struct bitstr *bs, const struct field_t *f, char *base, int level) { PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); @@ -273,7 +273,7 @@ static int decode_nul(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_bool(bitstr_t *bs, const struct field_t *f, +static int decode_bool(struct bitstr *bs, const struct field_t *f, char *base, int level) { PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); @@ -285,7 +285,7 @@ static int decode_bool(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_oid(bitstr_t *bs, const struct field_t *f, +static int decode_oid(struct bitstr *bs, const struct field_t *f, char *base, int level) { int len; @@ -302,7 +302,7 @@ static int decode_oid(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_int(bitstr_t *bs, const struct field_t *f, +static int decode_int(struct bitstr *bs, const struct field_t *f, char *base, int level) { unsigned int len; @@ -346,7 +346,7 @@ static int decode_int(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_enum(bitstr_t *bs, const struct field_t *f, +static int decode_enum(struct bitstr *bs, const struct field_t *f, char *base, int level) { PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); @@ -362,7 +362,7 @@ static int decode_enum(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_bitstr(bitstr_t *bs, const struct field_t *f, +static int decode_bitstr(struct bitstr *bs, const struct field_t *f, char *base, int level) { unsigned int len; @@ -396,7 +396,7 @@ static int decode_bitstr(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_numstr(bitstr_t *bs, const struct field_t *f, +static int decode_numstr(struct bitstr *bs, const struct field_t *f, char *base, int level) { unsigned int len; @@ -414,7 +414,7 @@ static int decode_numstr(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_octstr(bitstr_t *bs, const struct field_t *f, +static int decode_octstr(struct bitstr *bs, const struct field_t *f, char *base, int level) { unsigned int len; @@ -463,7 +463,7 @@ static int decode_octstr(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_bmpstr(bitstr_t *bs, const struct field_t *f, +static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, char *base, int level) { unsigned int len; @@ -489,7 +489,7 @@ static int decode_bmpstr(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_seq(bitstr_t *bs, const struct field_t *f, +static int decode_seq(struct bitstr *bs, const struct field_t *f, char *base, int level) { unsigned int ext, bmp, i, opt, len = 0, bmp2, bmp2_len; @@ -606,7 +606,7 @@ static int decode_seq(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_seqof(bitstr_t *bs, const struct field_t *f, +static int decode_seqof(struct bitstr *bs, const struct field_t *f, char *base, int level) { unsigned int count, effective_count = 0, i, len = 0; @@ -696,7 +696,7 @@ static int decode_seqof(bitstr_t *bs, const struct field_t *f, /****************************************************************************/ -static int decode_choice(bitstr_t *bs, const struct field_t *f, +static int decode_choice(struct bitstr *bs, const struct field_t *f, char *base, int level) { unsigned int type, ext, len = 0; @@ -772,7 +772,7 @@ int DecodeRasMessage(unsigned char *buf, size_t sz, RasMessage *ras) FNAME("RasMessage") CHOICE, 5, 24, 32, DECODE | EXT, 0, _RasMessage }; - bitstr_t bs; + struct bitstr bs; bs.buf = bs.beg = bs.cur = buf; bs.end = buf + sz; @@ -789,7 +789,7 @@ static int DecodeH323_UserInformation(unsigned char *buf, unsigned char *beg, FNAME("H323-UserInformation") SEQ, 1, 2, 2, DECODE | EXT, 0, _H323_UserInformation }; - bitstr_t bs; + struct bitstr bs; bs.buf = buf; bs.beg = bs.cur = beg; @@ -808,7 +808,7 @@ int DecodeMultimediaSystemControlMessage(unsigned char *buf, size_t sz, FNAME("MultimediaSystemControlMessage") CHOICE, 2, 4, 4, DECODE | EXT, 0, _MultimediaSystemControlMessage }; - bitstr_t bs; + struct bitstr bs; bs.buf = bs.beg = bs.cur = buf; bs.end = buf + sz; @@ -877,6 +877,7 @@ int DecodeQ931(unsigned char *buf, size_t sz, Q931 *q931) if (sz < 1) break; len = *p++; + sz--; if (sz < len) break; p += len; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index de4053d84364..59c08997bfdf 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -533,11 +533,12 @@ nla_put_failure: return -1; } -static inline size_t ctnetlink_proto_size(const struct nf_conn *ct) +#if defined(CONFIG_NETFILTER_NETLINK_GLUE_CT) || defined(CONFIG_NF_CONNTRACK_EVENTS) +static size_t ctnetlink_proto_size(const struct nf_conn *ct) { const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; - size_t len; + size_t len, len4 = 0; l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); len = l3proto->nla_size; @@ -545,9 +546,14 @@ static inline size_t ctnetlink_proto_size(const struct nf_conn *ct) l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); len += l4proto->nla_size; + if (l4proto->nlattr_tuple_size) { + len4 = l4proto->nlattr_tuple_size(); + len4 *= 3u; /* ORIG, REPLY, MASTER */ + } - return len; + return len + len4; } +#endif static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) { diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index b3e489c859ec..c8e9c9503a08 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -27,6 +27,7 @@ #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_log.h> static struct nf_conntrack_l4proto __rcu **nf_ct_protos[NFPROTO_NUMPROTO] __read_mostly; struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO] __read_mostly; @@ -63,6 +64,52 @@ nf_ct_unregister_sysctl(struct ctl_table_header **header, *header = NULL; *table = NULL; } + +__printf(5, 6) +void nf_l4proto_log_invalid(const struct sk_buff *skb, + struct net *net, + u16 pf, u8 protonum, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (net->ct.sysctl_log_invalid != protonum || + net->ct.sysctl_log_invalid != IPPROTO_RAW) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_proto_%d: %pV ", protonum, &vaf); + va_end(args); +} +EXPORT_SYMBOL_GPL(nf_l4proto_log_invalid); + +__printf(3, 4) +void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, + const struct nf_conn *ct, + const char *fmt, ...) +{ + struct va_format vaf; + struct net *net; + va_list args; + + net = nf_ct_net(ct); + if (likely(net->ct.sysctl_log_invalid == 0)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + nf_l4proto_log_invalid(skb, net, nf_ct_l3num(ct), + nf_ct_protonum(ct), "%pV", &vaf); + va_end(args); +} +EXPORT_SYMBOL_GPL(nf_ct_l4proto_log_invalid); #endif const struct nf_conntrack_l4proto * @@ -125,7 +172,7 @@ void nf_ct_l3proto_module_put(unsigned short l3proto) } EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put); -int nf_ct_netns_get(struct net *net, u8 nfproto) +static int nf_ct_netns_do_get(struct net *net, u8 nfproto) { const struct nf_conntrack_l3proto *l3proto; int ret; @@ -150,9 +197,33 @@ int nf_ct_netns_get(struct net *net, u8 nfproto) return ret; } + +int nf_ct_netns_get(struct net *net, u8 nfproto) +{ + int err; + + if (nfproto == NFPROTO_INET) { + err = nf_ct_netns_do_get(net, NFPROTO_IPV4); + if (err < 0) + goto err1; + err = nf_ct_netns_do_get(net, NFPROTO_IPV6); + if (err < 0) + goto err2; + } else { + err = nf_ct_netns_do_get(net, nfproto); + if (err < 0) + goto err1; + } + return 0; + +err2: + nf_ct_netns_put(net, NFPROTO_IPV4); +err1: + return err; +} EXPORT_SYMBOL_GPL(nf_ct_netns_get); -void nf_ct_netns_put(struct net *net, u8 nfproto) +static void nf_ct_netns_do_put(struct net *net, u8 nfproto) { const struct nf_conntrack_l3proto *l3proto; @@ -171,6 +242,15 @@ void nf_ct_netns_put(struct net *net, u8 nfproto) nf_ct_l3proto_module_put(nfproto); } + +void nf_ct_netns_put(struct net *net, uint8_t nfproto) +{ + if (nfproto == NFPROTO_INET) { + nf_ct_netns_do_put(net, NFPROTO_IPV4); + nf_ct_netns_do_put(net, NFPROTO_IPV6); + } else + nf_ct_netns_do_put(net, nfproto); +} EXPORT_SYMBOL_GPL(nf_ct_netns_put); const struct nf_conntrack_l4proto * @@ -351,8 +431,6 @@ int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *l4proto) l4proto->nla_size = 0; if (l4proto->nlattr_size) l4proto->nla_size += l4proto->nlattr_size(); - if (l4proto->nlattr_tuple_size) - l4proto->nla_size += 3 * l4proto->nlattr_tuple_size(); rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], l4proto); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 0f5a4d79f6b8..2a446f4a554c 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -428,13 +428,13 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, default: dn = dccp_pernet(net); if (dn->dccp_loose == 0) { - msg = "nf_ct_dccp: not picking up existing connection "; + msg = "not picking up existing connection "; goto out_invalid; } case CT_DCCP_REQUEST: break; case CT_DCCP_INVALID: - msg = "nf_ct_dccp: invalid state transition "; + msg = "invalid state transition "; goto out_invalid; } @@ -447,9 +447,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, return true; out_invalid: - if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(net, nf_ct_l3num(ct), 0, skb, NULL, NULL, - NULL, "%s", msg); + nf_ct_l4proto_log_invalid(skb, ct, "%s", msg); return false; } @@ -469,10 +467,8 @@ static unsigned int *dccp_get_timeouts(struct net *net) static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { - struct net *net = nf_ct_net(ct); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct dccp_hdr _dh, *dh; u_int8_t type, old_state, new_state; @@ -534,15 +530,11 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.dccp.last_pkt = type; spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_dccp: invalid packet ignored "); + nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid packet"); return NF_ACCEPT; case CT_DCCP_INVALID: spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_dccp: invalid state transition "); + nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid state transition"); return -NF_ACCEPT; } @@ -604,8 +596,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl, return NF_ACCEPT; out_invalid: - if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", msg); + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_DCCP, "%s", msg); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 9cd40700842e..1f86ddf6649a 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -60,7 +60,6 @@ static int generic_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeout) { nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 09a90484c27d..a2503005d80b 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -244,7 +244,6 @@ static int gre_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { /* If we've seen traffic both ways, this is a GRE connection. diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 6303a88af12b..80faf04ddf15 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -306,7 +306,6 @@ static int sctp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { enum sctp_conntrack new_state, old_state; @@ -522,8 +521,7 @@ static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb, } return NF_ACCEPT; out_invalid: - if (LOG_INVALID(net, IPPROTO_SCTP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", logmsg); + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_SCTP, "%s", logmsg); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index cba1c6ffe51a..b12fc07111d0 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -493,8 +493,7 @@ static bool tcp_in_window(const struct nf_conn *ct, unsigned int index, const struct sk_buff *skb, unsigned int dataoff, - const struct tcphdr *tcph, - u_int8_t pf) + const struct tcphdr *tcph) { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = tcp_pernet(net); @@ -702,9 +701,9 @@ static bool tcp_in_window(const struct nf_conn *ct, if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || tn->tcp_be_liberal) res = true; - if (!res && LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: %s ", + if (!res) { + nf_ct_l4proto_log_invalid(skb, ct, + "%s", before(seq, sender->td_maxend + 1) ? in_recv_win ? before(sack, receiver->td_end + 1) ? @@ -713,6 +712,7 @@ static bool tcp_in_window(const struct nf_conn *ct, : "ACK is over the upper bound (ACKed data not seen yet)" : "SEQ is under the lower bound (already ACKed data retransmitted)" : "SEQ is over the upper bound (over the window of the receiver)"); + } } pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u " @@ -738,6 +738,12 @@ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK| [TCPHDR_ACK|TCPHDR_URG] = 1, }; +static void tcp_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_TCP, "%s", msg); +} + /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ static int tcp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, @@ -753,17 +759,13 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, /* Smaller that minimal TCP header? */ th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); if (th == NULL) { - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: short packet "); + tcp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: truncated/malformed packet "); + tcp_error_log(skb, net, pf, "truncated packet"); return -NF_ACCEPT; } @@ -774,18 +776,14 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, /* FIXME: Source route IP option packets --RR */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: bad TCP checksum "); + tcp_error_log(skb, net, pf, "bad checksum"); return -NF_ACCEPT; } /* Check TCP flags. */ tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); if (!tcp_valid_flags[tcpflags]) { - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: invalid TCP flag combination "); + tcp_error_log(skb, net, pf, "invalid tcp flag combination"); return -NF_ACCEPT; } @@ -802,7 +800,6 @@ static int tcp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { struct net *net = nf_ct_net(ct); @@ -939,10 +936,8 @@ static int tcp_packet(struct nf_conn *ct, IP_CT_EXP_CHALLENGE_ACK; } spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: invalid packet ignored in " - "state %s ", tcp_conntrack_names[old_state]); + nf_ct_l4proto_log_invalid(skb, ct, "invalid packet ignored in " + "state %s ", tcp_conntrack_names[old_state]); return NF_ACCEPT; case TCP_CONNTRACK_MAX: /* Special case for SYN proxy: when the SYN to the server or @@ -964,9 +959,7 @@ static int tcp_packet(struct nf_conn *ct, pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: invalid state "); + nf_ct_l4proto_log_invalid(skb, ct, "invalid state"); return -NF_ACCEPT; case TCP_CONNTRACK_TIME_WAIT: /* RFC5961 compliance cause stack to send "challenge-ACK" @@ -981,9 +974,7 @@ static int tcp_packet(struct nf_conn *ct, /* Detected RFC5961 challenge ACK */ ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: challenge-ACK ignored "); + nf_ct_l4proto_log_invalid(skb, ct, "challenge-ack ignored"); return NF_ACCEPT; /* Don't change state */ } break; @@ -993,9 +984,7 @@ static int tcp_packet(struct nf_conn *ct, && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) { /* Invalid RST */ spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, - NULL, "nf_ct_tcp: invalid RST "); + nf_ct_l4proto_log_invalid(skb, ct, "invalid rst"); return -NF_ACCEPT; } if (index == TCP_RST_SET @@ -1022,7 +1011,7 @@ static int tcp_packet(struct nf_conn *ct, } if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, - skb, dataoff, th, pf)) { + skb, dataoff, th)) { spin_unlock_bh(&ct->lock); return -NF_ACCEPT; } @@ -1288,9 +1277,14 @@ static int tcp_nlattr_size(void) + nla_policy_len(tcp_nla_policy, CTA_PROTOINFO_TCP_MAX + 1); } -static int tcp_nlattr_tuple_size(void) +static unsigned int tcp_nlattr_tuple_size(void) { - return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); + + return size; } #endif diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 8af734cd1a94..3a5f727103af 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -73,7 +73,6 @@ static int udp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { /* If we've seen traffic both ways, this is some kind of UDP @@ -99,6 +98,12 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, } #ifdef CONFIG_NF_CT_PROTO_UDPLITE +static void udplite_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDPLITE, "%s", msg); +} + static int udplite_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, @@ -112,9 +117,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (!hdr) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: short packet "); + udplite_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } @@ -122,17 +125,13 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, if (cscov == 0) { cscov = udplen; } else if (cscov < sizeof(*hdr) || cscov > udplen) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: invalid checksum coverage "); + udplite_error_log(skb, net, pf, "invalid checksum coverage"); return -NF_ACCEPT; } /* UDPLITE mandates checksums */ if (!hdr->check) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: checksum missing "); + udplite_error_log(skb, net, pf, "checksum missing"); return -NF_ACCEPT; } @@ -140,9 +139,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, pf)) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: bad UDPLite checksum "); + udplite_error_log(skb, net, pf, "bad checksum"); return -NF_ACCEPT; } @@ -150,6 +147,12 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, } #endif +static void udp_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDP, "%s", msg); +} + static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u_int8_t pf, @@ -162,17 +165,13 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hdr == NULL) { - if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udp: short packet "); + udp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { - if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udp: truncated/malformed packet "); + udp_error_log(skb, net, pf, "truncated/malformed packet"); return -NF_ACCEPT; } @@ -186,9 +185,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, * FIXME: Source route IP option packets --RR */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { - if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udp: bad UDP checksum "); + udp_error_log(skb, net, pf, "bad checksum"); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index af8345fc4fbd..6c38421e31f9 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -542,17 +542,14 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) if (nf_nat_proto_remove(ct, data)) return 1; - if ((ct->status & IPS_SRC_NAT_DONE) == 0) - return 0; - - /* This netns is being destroyed, and conntrack has nat null binding. + /* This module is being removed and conntrack has nat null binding. * Remove it from bysource hash, as the table will be freed soon. * * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() * will delete entry from already-freed table. */ - clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); - __nf_nat_cleanup_conntrack(ct); + if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status)) + __nf_nat_cleanup_conntrack(ct); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c index e84a578dbe35..d76afafdc699 100644 --- a/net/netfilter/nf_nat_ftp.c +++ b/net/netfilter/nf_nat_ftp.c @@ -134,7 +134,7 @@ static int __init nf_nat_ftp_init(void) } /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ -static int warn_set(const char *val, struct kernel_param *kp) +static int warn_set(const char *val, const struct kernel_param *kp) { printk(KERN_INFO KBUILD_MODNAME ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c index 0648cb096bd8..dcb5f6375d9d 100644 --- a/net/netfilter/nf_nat_irc.c +++ b/net/netfilter/nf_nat_irc.c @@ -106,7 +106,7 @@ static int __init nf_nat_irc_init(void) } /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ -static int warn_set(const char *val, struct kernel_param *kp) +static int warn_set(const char *val, const struct kernel_param *kp) { printk(KERN_INFO KBUILD_MODNAME ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 64e1ee091225..d8327b43e4dc 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2549,14 +2549,9 @@ nft_select_set_ops(const struct nft_ctx *ctx, case NFT_SET_POL_PERFORMANCE: if (est.lookup < best.lookup) break; - if (est.lookup == best.lookup) { - if (!desc->size) { - if (est.space < best.space) - break; - } else if (est.size < best.size) { - break; - } - } + if (est.lookup == best.lookup && + est.space < best.space) + break; continue; case NFT_SET_POL_MEMORY: if (!desc->size) { @@ -3593,45 +3588,6 @@ static int nf_tables_dump_set_done(struct netlink_callback *cb) return 0; } -static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) -{ - u8 genmask = nft_genmask_cur(net); - const struct nft_set *set; - struct nft_ctx ctx; - int err; - - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); - if (err < 0) - return err; - - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], - genmask); - if (IS_ERR(set)) - return PTR_ERR(set); - - if (nlh->nlmsg_flags & NLM_F_DUMP) { - struct netlink_dump_control c = { - .dump = nf_tables_dump_set, - .done = nf_tables_dump_set_done, - }; - struct nft_set_dump_ctx *dump_ctx; - - dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL); - if (!dump_ctx) - return -ENOMEM; - - dump_ctx->set = set; - dump_ctx->ctx = ctx; - - c.data = dump_ctx; - return netlink_dump_start(nlsk, skb, nlh, &c); - } - return -EOPNOTSUPP; -} - static int nf_tables_fill_setelem_info(struct sk_buff *skb, const struct nft_ctx *ctx, u32 seq, u32 portid, int event, u16 flags, @@ -3677,6 +3633,135 @@ nla_put_failure: return -1; } +static int nft_setelem_parse_flags(const struct nft_set *set, + const struct nlattr *attr, u32 *flags) +{ + if (attr == NULL) + return 0; + + *flags = ntohl(nla_get_be32(attr)); + if (*flags & ~NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + if (!(set->flags & NFT_SET_INTERVAL) && + *flags & NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + + return 0; +} + +static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, + const struct nlattr *attr) +{ + struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; + const struct nft_set_ext *ext; + struct nft_data_desc desc; + struct nft_set_elem elem; + struct sk_buff *skb; + uint32_t flags = 0; + void *priv; + int err; + + err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, + nft_set_elem_policy, NULL); + if (err < 0) + return err; + + if (!nla[NFTA_SET_ELEM_KEY]) + return -EINVAL; + + err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); + if (err < 0) + return err; + + err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, + nla[NFTA_SET_ELEM_KEY]); + if (err < 0) + return err; + + err = -EINVAL; + if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) + return err; + + priv = set->ops->get(ctx->net, set, &elem, flags); + if (IS_ERR(priv)) + return PTR_ERR(priv); + + elem.priv = priv; + ext = nft_set_elem_ext(set, &elem); + + err = -ENOMEM; + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto err1; + + err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid, + NFT_MSG_NEWSETELEM, 0, set, &elem); + if (err < 0) + goto err2; + + err = nfnetlink_unicast(skb, ctx->net, ctx->portid, MSG_DONTWAIT); + /* This avoids a loop in nfnetlink. */ + if (err < 0) + goto err1; + + return 0; +err2: + kfree_skb(skb); +err1: + /* this avoids a loop in nfnetlink. */ + return err == -EAGAIN ? -ENOBUFS : err; +} + +static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) +{ + u8 genmask = nft_genmask_cur(net); + struct nft_set *set; + struct nlattr *attr; + struct nft_ctx ctx; + int rem, err = 0; + + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); + if (err < 0) + return err; + + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + genmask); + if (IS_ERR(set)) + return PTR_ERR(set); + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nf_tables_dump_set, + .done = nf_tables_dump_set_done, + }; + struct nft_set_dump_ctx *dump_ctx; + + dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL); + if (!dump_ctx) + return -ENOMEM; + + dump_ctx->set = set; + dump_ctx->ctx = ctx; + + c.data = dump_ctx; + return netlink_dump_start(nlsk, skb, nlh, &c); + } + + if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) + return -EINVAL; + + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { + err = nft_get_set_elem(&ctx, set, attr); + if (err < 0) + break; + } + + return err; +} + static void nf_tables_setelem_notify(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_set_elem *elem, @@ -3777,22 +3862,6 @@ static void nf_tables_set_elem_destroy(const struct nft_set *set, void *elem) kfree(elem); } -static int nft_setelem_parse_flags(const struct nft_set *set, - const struct nlattr *attr, u32 *flags) -{ - if (attr == NULL) - return 0; - - *flags = ntohl(nla_get_be32(attr)); - if (*flags & ~NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; - if (!(set->flags & NFT_SET_INTERVAL) && - *flags & NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; - - return 0; -} - static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr, u32 nlmsg_flags) { diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index c9796629858f..a16356cacec3 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -401,7 +401,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, outdev = entry->state.out; - switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) { + switch ((enum nfqnl_config_mode)READ_ONCE(queue->copy_mode)) { case NFQNL_COPY_META: case NFQNL_COPY_NONE: break; @@ -412,7 +412,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, skb_checksum_help(entskb)) return NULL; - data_len = ACCESS_ONCE(queue->copy_range); + data_len = READ_ONCE(queue->copy_range); if (data_len > entskb->len) data_len = entskb->len; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index bd0975d7dd6f..2647b895f4b0 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -312,39 +312,6 @@ static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = { [NFTA_CT_SREG] = { .type = NLA_U32 }, }; -static int nft_ct_netns_get(struct net *net, uint8_t family) -{ - int err; - - if (family == NFPROTO_INET) { - err = nf_ct_netns_get(net, NFPROTO_IPV4); - if (err < 0) - goto err1; - err = nf_ct_netns_get(net, NFPROTO_IPV6); - if (err < 0) - goto err2; - } else { - err = nf_ct_netns_get(net, family); - if (err < 0) - goto err1; - } - return 0; - -err2: - nf_ct_netns_put(net, NFPROTO_IPV4); -err1: - return err; -} - -static void nft_ct_netns_put(struct net *net, uint8_t family) -{ - if (family == NFPROTO_INET) { - nf_ct_netns_put(net, NFPROTO_IPV4); - nf_ct_netns_put(net, NFPROTO_IPV6); - } else - nf_ct_netns_put(net, family); -} - #ifdef CONFIG_NF_CONNTRACK_ZONES static void nft_ct_tmpl_put_pcpu(void) { @@ -489,7 +456,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, if (err < 0) return err; - err = nft_ct_netns_get(ctx->net, ctx->afi->family); + err = nf_ct_netns_get(ctx->net, ctx->afi->family); if (err < 0) return err; @@ -583,7 +550,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, if (err < 0) goto err1; - err = nft_ct_netns_get(ctx->net, ctx->afi->family); + err = nf_ct_netns_get(ctx->net, ctx->afi->family); if (err < 0) goto err1; @@ -606,7 +573,7 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv = nft_expr_priv(expr); __nft_ct_set_destroy(ctx, priv); - nft_ct_netns_put(ctx->net, ctx->afi->family); + nf_ct_netns_put(ctx->net, ctx->afi->family); } static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 734989c40579..45fb2752fb63 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -106,6 +106,23 @@ nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this, return NULL; } +static void *nft_bitmap_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + const struct nft_bitmap *priv = nft_set_priv(set); + u8 genmask = nft_genmask_cur(net); + struct nft_bitmap_elem *be; + + list_for_each_entry_rcu(be, &priv->list, head) { + if (memcmp(nft_set_ext_key(&be->ext), elem->key.val.data, set->klen) || + !nft_set_elem_active(&be->ext, genmask)) + continue; + + return be; + } + return ERR_PTR(-ENOENT); +} + static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_set_ext **ext) @@ -294,6 +311,7 @@ static struct nft_set_ops nft_bitmap_ops __read_mostly = { .activate = nft_bitmap_activate, .lookup = nft_bitmap_lookup, .walk = nft_bitmap_walk, + .get = nft_bitmap_get, }; static struct nft_set_type nft_bitmap_type __read_mostly = { diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 9c0d5a7ce5f9..f8166c1d5430 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -95,6 +95,24 @@ static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, return !!he; } +static void *nft_rhash_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_elem *he; + struct nft_rhash_cmp_arg arg = { + .genmask = nft_genmask_cur(net), + .set = set, + .key = elem->key.val.data, + }; + + he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); + if (he != NULL) + return he; + + return ERR_PTR(-ENOENT); +} + static bool nft_rhash_update(struct nft_set *set, const u32 *key, void *(*new)(struct nft_set *, const struct nft_expr *, @@ -409,6 +427,24 @@ static bool nft_hash_lookup(const struct net *net, const struct nft_set *set, return false; } +static void *nft_hash_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + struct nft_hash *priv = nft_set_priv(set); + u8 genmask = nft_genmask_cur(net); + struct nft_hash_elem *he; + u32 hash; + + hash = jhash(elem->key.val.data, set->klen, priv->seed); + hash = reciprocal_scale(hash, priv->buckets); + hlist_for_each_entry_rcu(he, &priv->table[hash], node) { + if (!memcmp(nft_set_ext_key(&he->ext), elem->key.val.data, set->klen) && + nft_set_elem_active(&he->ext, genmask)) + return he; + } + return ERR_PTR(-ENOENT); +} + /* nft_hash_select_ops() makes sure key size can be either 2 or 4 bytes . */ static inline u32 nft_hash_key(const u32 *key, u32 klen) { @@ -494,7 +530,7 @@ static void *nft_hash_deactivate(const struct net *net, hash = reciprocal_scale(hash, priv->buckets); hlist_for_each_entry(he, &priv->table[hash], node) { if (!memcmp(nft_set_ext_key(&this->ext), &elem->key.val, - set->klen) || + set->klen) && nft_set_elem_active(&he->ext, genmask)) { nft_set_elem_change_active(net, set, &he->ext); return he; @@ -600,6 +636,7 @@ static struct nft_set_ops nft_rhash_ops __read_mostly = { .lookup = nft_rhash_lookup, .update = nft_rhash_update, .walk = nft_rhash_walk, + .get = nft_rhash_get, .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, }; @@ -617,6 +654,7 @@ static struct nft_set_ops nft_hash_ops __read_mostly = { .remove = nft_hash_remove, .lookup = nft_hash_lookup, .walk = nft_hash_walk, + .get = nft_hash_get, .features = NFT_SET_MAP | NFT_SET_OBJECT, }; @@ -634,6 +672,7 @@ static struct nft_set_ops nft_hash_fast_ops __read_mostly = { .remove = nft_hash_remove, .lookup = nft_hash_lookup_fast, .walk = nft_hash_walk, + .get = nft_hash_get, .features = NFT_SET_MAP | NFT_SET_OBJECT, }; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index d83a4ec5900d..e6f08bc5f359 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -113,6 +113,78 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, return ret; } +static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, + const u32 *key, struct nft_rbtree_elem **elem, + unsigned int seq, unsigned int flags, u8 genmask) +{ + struct nft_rbtree_elem *rbe, *interval = NULL; + struct nft_rbtree *priv = nft_set_priv(set); + const struct rb_node *parent; + const void *this; + int d; + + parent = rcu_dereference_raw(priv->root.rb_node); + while (parent != NULL) { + if (read_seqcount_retry(&priv->count, seq)) + return false; + + rbe = rb_entry(parent, struct nft_rbtree_elem, node); + + this = nft_set_ext_key(&rbe->ext); + d = memcmp(this, key, set->klen); + if (d < 0) { + parent = rcu_dereference_raw(parent->rb_left); + interval = rbe; + } else if (d > 0) { + parent = rcu_dereference_raw(parent->rb_right); + } else { + if (!nft_set_elem_active(&rbe->ext, genmask)) + parent = rcu_dereference_raw(parent->rb_left); + + if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) || + (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) == + (flags & NFT_SET_ELEM_INTERVAL_END)) { + *elem = rbe; + return true; + } + return false; + } + } + + if (set->flags & NFT_SET_INTERVAL && interval != NULL && + nft_set_elem_active(&interval->ext, genmask) && + !nft_rbtree_interval_end(interval)) { + *elem = interval; + return true; + } + + return false; +} + +static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + struct nft_rbtree *priv = nft_set_priv(set); + unsigned int seq = read_seqcount_begin(&priv->count); + struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT); + const u32 *key = (const u32 *)&elem->key.val; + u8 genmask = nft_genmask_cur(net); + bool ret; + + ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); + if (ret || !read_seqcount_retry(&priv->count, seq)) + return rbe; + + read_lock_bh(&priv->lock); + seq = read_seqcount_begin(&priv->count); + ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); + if (!ret) + rbe = ERR_PTR(-ENOENT); + read_unlock_bh(&priv->lock); + + return rbe; +} + static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, struct nft_set_ext **ext) @@ -336,6 +408,7 @@ static struct nft_set_ops nft_rbtree_ops __read_mostly = { .activate = nft_rbtree_activate, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, + .get = nft_rbtree_get, .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT, }; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index d8571f414208..a77dd514297c 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1153,6 +1153,7 @@ xt_replace_table(struct xt_table *table, int *error) { struct xt_table_info *private; + unsigned int cpu; int ret; ret = xt_jumpstack_alloc(newinfo); @@ -1182,14 +1183,28 @@ xt_replace_table(struct xt_table *table, smp_wmb(); table->private = newinfo; + /* make sure all cpus see new ->private value */ + smp_wmb(); + /* * Even though table entries have now been swapped, other CPU's - * may still be using the old entries. This is okay, because - * resynchronization happens because of the locking done - * during the get_counters() routine. + * may still be using the old entries... */ local_bh_enable(); + /* ... so wait for even xt_recseq on all cpus */ + for_each_possible_cpu(cpu) { + seqcount_t *s = &per_cpu(xt_recseq, cpu); + u32 seq = raw_read_seqcount(s); + + if (seq & 1) { + do { + cond_resched(); + cpu_relax(); + } while (seq == raw_read_seqcount(s)); + } + } + #ifdef CONFIG_AUDIT if (audit_enabled) { audit_log(current->audit_context, GFP_KERNEL, diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index ffa8eec980e9..a6214f235333 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -46,7 +46,6 @@ struct xt_connlimit_conn { struct hlist_node node; struct nf_conntrack_tuple tuple; - union nf_inet_addr addr; }; struct xt_connlimit_rb { @@ -72,16 +71,9 @@ static inline unsigned int connlimit_iphash(__be32 addr) } static inline unsigned int -connlimit_iphash6(const union nf_inet_addr *addr, - const union nf_inet_addr *mask) +connlimit_iphash6(const union nf_inet_addr *addr) { - union nf_inet_addr res; - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i) - res.ip6[i] = addr->ip6[i] & mask->ip6[i]; - - return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), + return jhash2((u32 *)addr->ip6, ARRAY_SIZE(addr->ip6), connlimit_rnd) % CONNLIMIT_SLOTS; } @@ -95,24 +87,13 @@ static inline bool already_closed(const struct nf_conn *conn) } static int -same_source_net(const union nf_inet_addr *addr, - const union nf_inet_addr *mask, - const union nf_inet_addr *u3, u_int8_t family) +same_source(const union nf_inet_addr *addr, + const union nf_inet_addr *u3, u_int8_t family) { - if (family == NFPROTO_IPV4) { - return ntohl(addr->ip & mask->ip) - - ntohl(u3->ip & mask->ip); - } else { - union nf_inet_addr lh, rh; - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i) { - lh.ip6[i] = addr->ip6[i] & mask->ip6[i]; - rh.ip6[i] = u3->ip6[i] & mask->ip6[i]; - } + if (family == NFPROTO_IPV4) + return ntohl(addr->ip) - ntohl(u3->ip); - return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6)); - } + return memcmp(addr->ip6, u3->ip6, sizeof(addr->ip6)); } static bool add_hlist(struct hlist_head *head, @@ -125,7 +106,6 @@ static bool add_hlist(struct hlist_head *head, if (conn == NULL) return false; conn->tuple = *tuple; - conn->addr = *addr; hlist_add_head(&conn->node, head); return true; } @@ -196,7 +176,7 @@ static void tree_nodes_free(struct rb_root *root, static unsigned int count_tree(struct net *net, struct rb_root *root, const struct nf_conntrack_tuple *tuple, - const union nf_inet_addr *addr, const union nf_inet_addr *mask, + const union nf_inet_addr *addr, u8 family, const struct nf_conntrack_zone *zone) { struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES]; @@ -217,7 +197,7 @@ count_tree(struct net *net, struct rb_root *root, rbconn = rb_entry(*rbnode, struct xt_connlimit_rb, node); parent = *rbnode; - diff = same_source_net(addr, mask, &rbconn->addr, family); + diff = same_source(addr, &rbconn->addr, family); if (diff < 0) { rbnode = &((*rbnode)->rb_left); } else if (diff > 0) { @@ -270,7 +250,6 @@ count_tree(struct net *net, struct rb_root *root, } conn->tuple = *tuple; - conn->addr = *addr; rbconn->addr = *addr; INIT_HLIST_HEAD(&rbconn->hhead); @@ -285,7 +264,6 @@ static int count_them(struct net *net, struct xt_connlimit_data *data, const struct nf_conntrack_tuple *tuple, const union nf_inet_addr *addr, - const union nf_inet_addr *mask, u_int8_t family, const struct nf_conntrack_zone *zone) { @@ -294,14 +272,14 @@ static int count_them(struct net *net, u32 hash; if (family == NFPROTO_IPV6) - hash = connlimit_iphash6(addr, mask); + hash = connlimit_iphash6(addr); else - hash = connlimit_iphash(addr->ip & mask->ip); + hash = connlimit_iphash(addr->ip); root = &data->climit_root[hash]; spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); - count = count_tree(net, root, tuple, addr, mask, family, zone); + count = count_tree(net, root, tuple, addr, family, zone); spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); @@ -332,16 +310,23 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) if (xt_family(par) == NFPROTO_IPV6) { const struct ipv6hdr *iph = ipv6_hdr(skb); + unsigned int i; + memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ? &iph->daddr : &iph->saddr, sizeof(addr.ip6)); + + for (i = 0; i < ARRAY_SIZE(addr.ip6); ++i) + addr.ip6[i] &= info->mask.ip6[i]; } else { const struct iphdr *iph = ip_hdr(skb); addr.ip = (info->flags & XT_CONNLIMIT_DADDR) ? iph->daddr : iph->saddr; + + addr.ip &= info->mask.ip; } connections = count_them(net, info->data, tuple_ptr, &addr, - &info->mask, xt_family(par), zone); + xt_family(par), zone); if (connections == 0) /* kmalloc failed, drop it entirely */ goto hotdrop; diff --git a/net/netlabel/netlabel_addrlist.h b/net/netlabel/netlabel_addrlist.h index d0f38bc9af6d..ac709f0f197b 100644 --- a/net/netlabel/netlabel_addrlist.h +++ b/net/netlabel/netlabel_addrlist.h @@ -87,7 +87,7 @@ static inline struct netlbl_af4list *__af4list_valid_rcu(struct list_head *s, struct list_head *i = s; struct netlbl_af4list *n = __af4list_entry(s); while (i != h && !n->valid) { - i = rcu_dereference(i->next); + i = rcu_dereference(list_next_rcu(i)); n = __af4list_entry(i); } return n; @@ -154,7 +154,7 @@ static inline struct netlbl_af6list *__af6list_valid_rcu(struct list_head *s, struct list_head *i = s; struct netlbl_af6list *n = __af6list_entry(s); while (i != h && !n->valid) { - i = rcu_dereference(i->next); + i = rcu_dereference(list_next_rcu(i)); n = __af6list_entry(i); } return n; diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index d177dd066504..4d748975117d 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -393,7 +393,7 @@ EXPORT_SYMBOL(netlbl_calipso_ops_register); static const struct netlbl_calipso_ops *netlbl_calipso_ops_get(void) { - return ACCESS_ONCE(calipso_ops); + return READ_ONCE(calipso_ops); } /** diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 26ded4239611..b9e0ee4e22f5 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -128,7 +128,6 @@ static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = { }; static int netlink_dump(struct sock *sk); -static void netlink_skb_destructor(struct sk_buff *skb); /* nl_table locking explained: * Lookup and traversal are protected with an RCU read-side lock. Insertion @@ -2136,7 +2135,7 @@ static int netlink_dump(struct sock *sk) struct sk_buff *skb = NULL; struct nlmsghdr *nlh; struct module *module; - int len, err = -ENOBUFS; + int err = -ENOBUFS; int alloc_min_size; int alloc_size; @@ -2183,9 +2182,11 @@ static int netlink_dump(struct sock *sk) skb_reserve(skb, skb_tailroom(skb) - alloc_size); netlink_skb_set_owner_r(skb, sk); - len = cb->dump(skb, cb); + if (nlk->dump_done_errno > 0) + nlk->dump_done_errno = cb->dump(skb, cb); - if (len > 0) { + if (nlk->dump_done_errno > 0 || + skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) { mutex_unlock(nlk->cb_mutex); if (sk_filter(sk, skb)) @@ -2195,13 +2196,15 @@ static int netlink_dump(struct sock *sk) return 0; } - nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI); - if (!nlh) + nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, + sizeof(nlk->dump_done_errno), NLM_F_MULTI); + if (WARN_ON(!nlh)) goto errout_skb; nl_dump_check_consistent(cb, nlh); - memcpy(nlmsg_data(nlh), &len, sizeof(len)); + memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, + sizeof(nlk->dump_done_errno)); if (sk_filter(sk, skb)) kfree_skb(skb); @@ -2273,6 +2276,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, } nlk->cb_running = true; + nlk->dump_done_errno = INT_MAX; mutex_unlock(nlk->cb_mutex); diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 028188597eaa..962de7b3c023 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -34,6 +34,7 @@ struct netlink_sock { wait_queue_head_t wait; bool bound; bool cb_running; + int dump_done_errno; struct netlink_callback cb; struct mutex *cb_mutex; struct mutex cb_def_mutex; diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c index 94d4e922af53..989ae647825e 100644 --- a/net/netrom/nr_loopback.c +++ b/net/netrom/nr_loopback.c @@ -18,7 +18,7 @@ static void nr_loopback_timer(unsigned long); static struct sk_buff_head loopback_queue; -static DEFINE_TIMER(loopback_timer, nr_loopback_timer, 0, 0); +static DEFINE_TIMER(loopback_timer, nr_loopback_timer); void __init nr_loopback_init(void) { diff --git a/net/nfc/core.c b/net/nfc/core.c index e5e23c2cbe74..947a470f929d 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -1015,9 +1015,9 @@ exit: device_unlock(&dev->dev); } -static void nfc_check_pres_timeout(unsigned long data) +static void nfc_check_pres_timeout(struct timer_list *t) { - struct nfc_dev *dev = (struct nfc_dev *)data; + struct nfc_dev *dev = from_timer(dev, t, check_pres_timer); schedule_work(&dev->check_pres_work); } @@ -1094,9 +1094,7 @@ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, dev->targets_generation = 1; if (ops->check_presence) { - setup_timer(&dev->check_pres_timer, nfc_check_pres_timeout, - (unsigned long)dev); - + timer_setup(&dev->check_pres_timer, nfc_check_pres_timeout, 0); INIT_WORK(&dev->check_pres_work, nfc_check_pres_work); } @@ -1105,7 +1103,7 @@ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, err_free_dev: kfree(dev); - return ERR_PTR(rc); + return NULL; } EXPORT_SYMBOL(nfc_allocate_device); diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index de6dd37d04c7..ec0a8998e52d 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -650,6 +650,7 @@ static void digital_deactivate_target(struct nfc_dev *nfc_dev, return; } + digital_abort_cmd(ddev); ddev->curr_protocol = 0; } diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index a8a6e7814e09..ac8030c4bcf8 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -428,9 +428,9 @@ exit_noskb: nfc_hci_driver_failure(hdev, r); } -static void nfc_hci_cmd_timeout(unsigned long data) +static void nfc_hci_cmd_timeout(struct timer_list *t) { - struct nfc_hci_dev *hdev = (struct nfc_hci_dev *)data; + struct nfc_hci_dev *hdev = from_timer(hdev, t, cmd_timer); schedule_work(&hdev->msg_tx_work); } @@ -1004,8 +1004,7 @@ int nfc_hci_register_device(struct nfc_hci_dev *hdev) INIT_WORK(&hdev->msg_tx_work, nfc_hci_msg_tx_work); - setup_timer(&hdev->cmd_timer, nfc_hci_cmd_timeout, - (unsigned long)hdev); + timer_setup(&hdev->cmd_timer, nfc_hci_cmd_timeout, 0); skb_queue_head_init(&hdev->rx_hcp_frags); diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index 58df37eae1e8..fe988936ad92 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -580,27 +580,27 @@ static void llc_shdlc_handle_send_queue(struct llc_shdlc *shdlc) } } -static void llc_shdlc_connect_timeout(unsigned long data) +static void llc_shdlc_connect_timeout(struct timer_list *t) { - struct llc_shdlc *shdlc = (struct llc_shdlc *)data; + struct llc_shdlc *shdlc = from_timer(shdlc, t, connect_timer); pr_debug("\n"); schedule_work(&shdlc->sm_work); } -static void llc_shdlc_t1_timeout(unsigned long data) +static void llc_shdlc_t1_timeout(struct timer_list *t) { - struct llc_shdlc *shdlc = (struct llc_shdlc *)data; + struct llc_shdlc *shdlc = from_timer(shdlc, t, t1_timer); pr_debug("SoftIRQ: need to send ack\n"); schedule_work(&shdlc->sm_work); } -static void llc_shdlc_t2_timeout(unsigned long data) +static void llc_shdlc_t2_timeout(struct timer_list *t) { - struct llc_shdlc *shdlc = (struct llc_shdlc *)data; + struct llc_shdlc *shdlc = from_timer(shdlc, t, t2_timer); pr_debug("SoftIRQ: need to retransmit\n"); @@ -763,14 +763,9 @@ static void *llc_shdlc_init(struct nfc_hci_dev *hdev, xmit_to_drv_t xmit_to_drv, mutex_init(&shdlc->state_mutex); shdlc->state = SHDLC_DISCONNECTED; - setup_timer(&shdlc->connect_timer, llc_shdlc_connect_timeout, - (unsigned long)shdlc); - - setup_timer(&shdlc->t1_timer, llc_shdlc_t1_timeout, - (unsigned long)shdlc); - - setup_timer(&shdlc->t2_timer, llc_shdlc_t2_timeout, - (unsigned long)shdlc); + timer_setup(&shdlc->connect_timer, llc_shdlc_connect_timeout, 0); + timer_setup(&shdlc->t1_timer, llc_shdlc_t1_timeout, 0); + timer_setup(&shdlc->t2_timer, llc_shdlc_t2_timeout, 0); shdlc->w = SHDLC_MAX_WINDOW; shdlc->srej_support = SHDLC_SREJ_SUPPORT; diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index 7988185072e5..ef4026a23e80 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -242,9 +242,9 @@ static void nfc_llcp_timeout_work(struct work_struct *work) nfc_dep_link_down(local->dev); } -static void nfc_llcp_symm_timer(unsigned long data) +static void nfc_llcp_symm_timer(struct timer_list *t) { - struct nfc_llcp_local *local = (struct nfc_llcp_local *) data; + struct nfc_llcp_local *local = from_timer(local, t, link_timer); pr_err("SYMM timeout\n"); @@ -285,9 +285,9 @@ static void nfc_llcp_sdreq_timeout_work(struct work_struct *work) nfc_genl_llc_send_sdres(local->dev, &nl_sdres_list); } -static void nfc_llcp_sdreq_timer(unsigned long data) +static void nfc_llcp_sdreq_timer(struct timer_list *t) { - struct nfc_llcp_local *local = (struct nfc_llcp_local *) data; + struct nfc_llcp_local *local = from_timer(local, t, sdreq_timer); schedule_work(&local->sdreq_timeout_work); } @@ -1573,8 +1573,7 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) INIT_LIST_HEAD(&local->list); kref_init(&local->ref); mutex_init(&local->sdp_lock); - setup_timer(&local->link_timer, nfc_llcp_symm_timer, - (unsigned long)local); + timer_setup(&local->link_timer, nfc_llcp_symm_timer, 0); skb_queue_head_init(&local->tx_queue); INIT_WORK(&local->tx_work, nfc_llcp_tx_work); @@ -1600,8 +1599,7 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) mutex_init(&local->sdreq_lock); INIT_HLIST_HEAD(&local->pending_sdreqs); - setup_timer(&local->sdreq_timer, nfc_llcp_sdreq_timer, - (unsigned long)local); + timer_setup(&local->sdreq_timer, nfc_llcp_sdreq_timer, 0); INIT_WORK(&local->sdreq_timeout_work, nfc_llcp_sdreq_timeout_work); list_add(&local->list, &llcp_devices); diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index b251fb936a27..c0b83dc9d993 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -75,7 +75,7 @@ static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target, if (!hdr) return -EMSGSIZE; - genl_dump_check_consistent(cb, hdr, &nfc_genl_family); + genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target->idx) || nla_put_u32(msg, NFC_ATTR_PROTOCOLS, target->supported_protocols) || @@ -603,7 +603,7 @@ static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev, return -EMSGSIZE; if (cb) - genl_dump_check_consistent(cb, hdr, &nfc_genl_family); + genl_dump_check_consistent(cb, hdr); if (nfc_genl_setup_device_added(dev, msg)) goto nla_put_failure; @@ -928,6 +928,30 @@ static int nfc_genl_activate_target(struct sk_buff *skb, struct genl_info *info) return rc; } +static int nfc_genl_deactivate_target(struct sk_buff *skb, + struct genl_info *info) +{ + struct nfc_dev *dev; + u32 device_idx, target_idx; + int rc; + + if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) + return -EINVAL; + + device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); + + dev = nfc_get_device(device_idx); + if (!dev) + return -ENODEV; + + target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); + + rc = nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP); + + nfc_put_device(dev); + return rc; +} + static int nfc_genl_dep_link_up(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; @@ -1332,7 +1356,7 @@ static int nfc_genl_send_se(struct sk_buff *msg, struct nfc_dev *dev, goto nla_put_failure; if (cb) - genl_dump_check_consistent(cb, hdr, &nfc_genl_family); + genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se->idx) || @@ -1751,6 +1775,11 @@ static const struct genl_ops nfc_genl_ops[] = { .doit = nfc_genl_vendor_cmd, .policy = nfc_genl_policy, }, + { + .cmd = NFC_CMD_DEACTIVATE_TARGET, + .doit = nfc_genl_deactivate_target, + .policy = nfc_genl_policy, + }, }; static struct genl_family nfc_genl_family __ro_after_init = { diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c index 58fb827439a8..d7da99a0b0b8 100644 --- a/net/nsh/nsh.c +++ b/net/nsh/nsh.c @@ -14,6 +14,66 @@ #include <net/nsh.h> #include <net/tun_proto.h> +int nsh_push(struct sk_buff *skb, const struct nshhdr *pushed_nh) +{ + struct nshhdr *nh; + size_t length = nsh_hdr_len(pushed_nh); + u8 next_proto; + + if (skb->mac_len) { + next_proto = TUN_P_ETHERNET; + } else { + next_proto = tun_p_from_eth_p(skb->protocol); + if (!next_proto) + return -EAFNOSUPPORT; + } + + /* Add the NSH header */ + if (skb_cow_head(skb, length) < 0) + return -ENOMEM; + + skb_push(skb, length); + nh = (struct nshhdr *)(skb->data); + memcpy(nh, pushed_nh, length); + nh->np = next_proto; + skb_postpush_rcsum(skb, nh, length); + + skb->protocol = htons(ETH_P_NSH); + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_mac_len(skb); + + return 0; +} +EXPORT_SYMBOL_GPL(nsh_push); + +int nsh_pop(struct sk_buff *skb) +{ + struct nshhdr *nh; + size_t length; + __be16 inner_proto; + + if (!pskb_may_pull(skb, NSH_BASE_HDR_LEN)) + return -ENOMEM; + nh = (struct nshhdr *)(skb->data); + length = nsh_hdr_len(nh); + inner_proto = tun_p_to_eth_p(nh->np); + if (!pskb_may_pull(skb, length)) + return -ENOMEM; + + if (!inner_proto) + return -EAFNOSUPPORT; + + skb_pull_rcsum(skb, length); + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_mac_len(skb); + skb->protocol = inner_proto; + + return 0; +} +EXPORT_SYMBOL_GPL(nsh_pop); + static struct sk_buff *nsh_gso_segment(struct sk_buff *skb, netdev_features_t features) { diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index ce947292ae77..2650205cdaf9 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -14,6 +14,7 @@ config OPENVSWITCH select MPLS select NET_MPLS_GSO select DST_CACHE + select NET_NSH ---help--- Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 299f4476cf44..41109c326f3a 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -12,6 +12,7 @@ openvswitch-y := \ flow.o \ flow_netlink.o \ flow_table.o \ + meter.o \ vport.o \ vport-internal_dev.o \ vport-netdev.o diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index a551232daf61..30a5df27116e 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -43,6 +43,7 @@ #include "flow.h" #include "conntrack.h" #include "vport.h" +#include "flow_netlink.h" struct deferred_action { struct sk_buff *skb; @@ -380,6 +381,38 @@ static int push_eth(struct sk_buff *skb, struct sw_flow_key *key, return 0; } +static int push_nsh(struct sk_buff *skb, struct sw_flow_key *key, + const struct nshhdr *nh) +{ + int err; + + err = nsh_push(skb, nh); + if (err) + return err; + + /* safe right before invalidate_flow_key */ + key->mac_proto = MAC_PROTO_NONE; + invalidate_flow_key(key); + return 0; +} + +static int pop_nsh(struct sk_buff *skb, struct sw_flow_key *key) +{ + int err; + + err = nsh_pop(skb); + if (err) + return err; + + /* safe right before invalidate_flow_key */ + if (skb->protocol == htons(ETH_P_TEB)) + key->mac_proto = MAC_PROTO_ETHERNET; + else + key->mac_proto = MAC_PROTO_NONE; + invalidate_flow_key(key); + return 0; +} + static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh, __be32 addr, __be32 new_addr) { @@ -602,6 +635,69 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } +static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct nlattr *a) +{ + struct nshhdr *nh; + size_t length; + int err; + u8 flags; + u8 ttl; + int i; + + struct ovs_key_nsh key; + struct ovs_key_nsh mask; + + err = nsh_key_from_nlattr(a, &key, &mask); + if (err) + return err; + + /* Make sure the NSH base header is there */ + if (!pskb_may_pull(skb, skb_network_offset(skb) + NSH_BASE_HDR_LEN)) + return -ENOMEM; + + nh = nsh_hdr(skb); + length = nsh_hdr_len(nh); + + /* Make sure the whole NSH header is there */ + err = skb_ensure_writable(skb, skb_network_offset(skb) + + length); + if (unlikely(err)) + return err; + + nh = nsh_hdr(skb); + skb_postpull_rcsum(skb, nh, length); + flags = nsh_get_flags(nh); + flags = OVS_MASKED(flags, key.base.flags, mask.base.flags); + flow_key->nsh.base.flags = flags; + ttl = nsh_get_ttl(nh); + ttl = OVS_MASKED(ttl, key.base.ttl, mask.base.ttl); + flow_key->nsh.base.ttl = ttl; + nsh_set_flags_and_ttl(nh, flags, ttl); + nh->path_hdr = OVS_MASKED(nh->path_hdr, key.base.path_hdr, + mask.base.path_hdr); + flow_key->nsh.base.path_hdr = nh->path_hdr; + switch (nh->mdtype) { + case NSH_M_TYPE1: + for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) { + nh->md1.context[i] = + OVS_MASKED(nh->md1.context[i], key.context[i], + mask.context[i]); + } + memcpy(flow_key->nsh.context, nh->md1.context, + sizeof(nh->md1.context)); + break; + case NSH_M_TYPE2: + memset(flow_key->nsh.context, 0, + sizeof(flow_key->nsh.context)); + break; + default: + return -EINVAL; + } + skb_postpush_rcsum(skb, nh, length); + return 0; +} + /* Must follow skb_ensure_writable() since that can move the skb data. */ static void set_tp_port(struct sk_buff *skb, __be16 *port, __be16 new_port, __sum16 *check) @@ -1024,6 +1120,10 @@ static int execute_masked_set_action(struct sk_buff *skb, get_mask(a, struct ovs_key_ethernet *)); break; + case OVS_KEY_ATTR_NSH: + err = set_nsh(skb, flow_key, a); + break; + case OVS_KEY_ATTR_IPV4: err = set_ipv4(skb, flow_key, nla_data(a), get_mask(a, struct ovs_key_ipv4 *)); @@ -1214,6 +1314,28 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_POP_ETH: err = pop_eth(skb, key); break; + + case OVS_ACTION_ATTR_PUSH_NSH: { + u8 buffer[NSH_HDR_MAX_LEN]; + struct nshhdr *nh = (struct nshhdr *)buffer; + + err = nsh_hdr_from_nlattr(nla_data(a), nh, + NSH_HDR_MAX_LEN); + if (unlikely(err)) + break; + err = push_nsh(skb, key, nh); + break; + } + + case OVS_ACTION_ATTR_POP_NSH: + err = pop_nsh(skb, key); + break; + + case OVS_ACTION_ATTR_METER: + if (ovs_meter_execute(dp, skb, key, nla_get_u32(a))) { + consume_skb(skb); + return 0; + } } if (unlikely(err)) { diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 4d38ac044cee..0dab33fb9844 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -55,6 +55,7 @@ #include "flow.h" #include "flow_table.h" #include "flow_netlink.h" +#include "meter.h" #include "vport-internal_dev.h" #include "vport-netdev.h" @@ -142,35 +143,6 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, const struct dp_upcall_info *, uint32_t cutlen); -/* Must be called with rcu_read_lock. */ -static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) -{ - struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex); - - if (dev) { - struct vport *vport = ovs_internal_dev_get_vport(dev); - if (vport) - return vport->dp; - } - - return NULL; -} - -/* The caller must hold either ovs_mutex or rcu_read_lock to keep the - * returned dp pointer valid. - */ -static inline struct datapath *get_dp(struct net *net, int dp_ifindex) -{ - struct datapath *dp; - - WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); - rcu_read_lock(); - dp = get_dp_rcu(net, dp_ifindex); - rcu_read_unlock(); - - return dp; -} - /* Must be called with rcu_read_lock or ovs_mutex. */ const char *ovs_dp_name(const struct datapath *dp) { @@ -203,6 +175,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) ovs_flow_tbl_destroy(&dp->table); free_percpu(dp->stats_percpu); kfree(dp->ports); + ovs_meters_exit(dp); kfree(dp); } @@ -1601,6 +1574,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) INIT_HLIST_HEAD(&dp->ports[i]); + err = ovs_meters_init(dp); + if (err) + goto err_destroy_ports_array; + /* Set up our datapath device. */ parms.name = nla_data(a[OVS_DP_ATTR_NAME]); parms.type = OVS_VPORT_TYPE_INTERNAL; @@ -1629,7 +1606,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_dp_reset_user_features(skb, info); } - goto err_destroy_ports_array; + goto err_destroy_meters; } err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, @@ -1644,8 +1621,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_notify(&dp_datapath_genl_family, reply, info); return 0; -err_destroy_ports_array: +err_destroy_meters: ovs_unlock(); + ovs_meters_exit(dp); +err_destroy_ports_array: kfree(dp->ports); err_destroy_percpu: free_percpu(dp->stats_percpu); @@ -2294,6 +2273,7 @@ static struct genl_family * const dp_genl_families[] = { &dp_vport_genl_family, &dp_flow_genl_family, &dp_packet_genl_family, + &dp_meter_genl_family, }; static void dp_unregister_genl(int n_families) @@ -2474,3 +2454,4 @@ MODULE_ALIAS_GENL_FAMILY(OVS_DATAPATH_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY); +MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 4a104ef9e12c..523d65526766 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -30,6 +30,8 @@ #include "conntrack.h" #include "flow.h" #include "flow_table.h" +#include "meter.h" +#include "vport-internal_dev.h" #define DP_MAX_PORTS USHRT_MAX #define DP_VPORT_HASH_BUCKETS 1024 @@ -91,6 +93,9 @@ struct datapath { u32 user_features; u32 max_headroom; + + /* Switch meters. */ + struct hlist_head *meters; }; /** @@ -190,6 +195,36 @@ static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_n return ovs_lookup_vport(dp, port_no); } +/* Must be called with rcu_read_lock. */ +static inline struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) +{ + struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex); + + if (dev) { + struct vport *vport = ovs_internal_dev_get_vport(dev); + + if (vport) + return vport->dp; + } + + return NULL; +} + +/* The caller must hold either ovs_mutex or rcu_read_lock to keep the + * returned dp pointer valid. + */ +static inline struct datapath *get_dp(struct net *net, int dp_ifindex) +{ + struct datapath *dp; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); + rcu_read_lock(); + dp = get_dp_rcu(net, dp_ifindex); + rcu_read_unlock(); + + return dp; +} + extern struct notifier_block ovs_dp_device_notifier; extern struct genl_family dp_vport_genl_family; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 8c94cef25a72..864ddb1e3642 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -46,6 +46,7 @@ #include <net/ipv6.h> #include <net/mpls.h> #include <net/ndisc.h> +#include <net/nsh.h> #include "conntrack.h" #include "datapath.h" @@ -490,6 +491,52 @@ invalid: return 0; } +static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key) +{ + struct nshhdr *nh; + unsigned int nh_ofs = skb_network_offset(skb); + u8 version, length; + int err; + + err = check_header(skb, nh_ofs + NSH_BASE_HDR_LEN); + if (unlikely(err)) + return err; + + nh = nsh_hdr(skb); + version = nsh_get_ver(nh); + length = nsh_hdr_len(nh); + + if (version != 0) + return -EINVAL; + + err = check_header(skb, nh_ofs + length); + if (unlikely(err)) + return err; + + nh = nsh_hdr(skb); + key->nsh.base.flags = nsh_get_flags(nh); + key->nsh.base.ttl = nsh_get_ttl(nh); + key->nsh.base.mdtype = nh->mdtype; + key->nsh.base.np = nh->np; + key->nsh.base.path_hdr = nh->path_hdr; + switch (key->nsh.base.mdtype) { + case NSH_M_TYPE1: + if (length != NSH_M_TYPE1_LEN) + return -EINVAL; + memcpy(key->nsh.context, nh->md1.context, + sizeof(nh->md1)); + break; + case NSH_M_TYPE2: + memset(key->nsh.context, 0, + sizeof(nh->md1)); + break; + default: + return -EINVAL; + } + + return 0; +} + /** * key_extract - extracts a flow key from an Ethernet frame. * @skb: sk_buff that contains the frame, with skb->data pointing to the @@ -735,6 +782,10 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) memset(&key->tp, 0, sizeof(key->tp)); } } + } else if (key->eth.type == htons(ETH_P_NSH)) { + error = parse_nsh(skb, key); + if (error) + return error; } return 0; } diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 1875bba4f865..c670dd24b8b7 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -35,6 +35,7 @@ #include <net/inet_ecn.h> #include <net/ip_tunnels.h> #include <net/dst_metadata.h> +#include <net/nsh.h> struct sk_buff; @@ -66,6 +67,11 @@ struct vlan_head { (offsetof(struct sw_flow_key, recirc_id) + \ FIELD_SIZEOF(struct sw_flow_key, recirc_id)) +struct ovs_key_nsh { + struct ovs_nsh_key_base base; + __be32 context[NSH_MD1_CONTEXT_SIZE]; +}; + struct sw_flow_key { u8 tun_opts[IP_TUNNEL_OPTS_MAX]; u8 tun_opts_len; @@ -143,6 +149,7 @@ struct sw_flow_key { } nd; }; } ipv6; + struct ovs_key_nsh nsh; /* network service header */ }; struct { /* Connection tracking fields not packed above. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index dc0d79092e74..dc424798ba6f 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -48,6 +48,7 @@ #include <net/ndisc.h> #include <net/mpls.h> #include <net/vxlan.h> +#include <net/tun_proto.h> #include <net/erspan.h> #include "flow_netlink.h" @@ -80,13 +81,16 @@ static bool actions_may_change_flow(const struct nlattr *actions) case OVS_ACTION_ATTR_HASH: case OVS_ACTION_ATTR_POP_ETH: case OVS_ACTION_ATTR_POP_MPLS: + case OVS_ACTION_ATTR_POP_NSH: case OVS_ACTION_ATTR_POP_VLAN: case OVS_ACTION_ATTR_PUSH_ETH: case OVS_ACTION_ATTR_PUSH_MPLS: + case OVS_ACTION_ATTR_PUSH_NSH: case OVS_ACTION_ATTR_PUSH_VLAN: case OVS_ACTION_ATTR_SAMPLE: case OVS_ACTION_ATTR_SET: case OVS_ACTION_ATTR_SET_MASKED: + case OVS_ACTION_ATTR_METER: default: return true; } @@ -175,7 +179,8 @@ static bool match_validate(const struct sw_flow_match *match, | (1 << OVS_KEY_ATTR_ICMPV6) | (1 << OVS_KEY_ATTR_ARP) | (1 << OVS_KEY_ATTR_ND) - | (1 << OVS_KEY_ATTR_MPLS)); + | (1 << OVS_KEY_ATTR_MPLS) + | (1 << OVS_KEY_ATTR_NSH)); /* Always allowed mask fields. */ mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) @@ -284,6 +289,14 @@ static bool match_validate(const struct sw_flow_match *match, } } + if (match->key->eth.type == htons(ETH_P_NSH)) { + key_expected |= 1 << OVS_KEY_ATTR_NSH; + if (match->mask && + match->mask->key.eth.type == htons(0xffff)) { + mask_allowed |= 1 << OVS_KEY_ATTR_NSH; + } + } + if ((key_attrs & key_expected) != key_expected) { /* Key attributes check failed. */ OVS_NLERR(log, "Missing key (keys=%llx, expected=%llx)", @@ -325,12 +338,25 @@ size_t ovs_tun_key_attr_size(void) + nla_total_size(4); /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */ } +static size_t ovs_nsh_key_attr_size(void) +{ + /* Whenever adding new OVS_NSH_KEY_ FIELDS, we should consider + * updating this function. + */ + return nla_total_size(NSH_BASE_HDR_LEN) /* OVS_NSH_KEY_ATTR_BASE */ + /* OVS_NSH_KEY_ATTR_MD1 and OVS_NSH_KEY_ATTR_MD2 are + * mutually exclusive, so the bigger one can cover + * the small one. + */ + + nla_total_size(NSH_CTX_HDRS_MAX_LEN); +} + size_t ovs_key_attr_size(void) { /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. */ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28); + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 29); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -344,6 +370,8 @@ size_t ovs_key_attr_size(void) + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */ + nla_total_size(40) /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */ + + nla_total_size(0) /* OVS_KEY_ATTR_NSH */ + + ovs_nsh_key_attr_size() + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ @@ -377,6 +405,13 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = sizeof(u32) }, }; +static const struct ovs_len_tbl +ovs_nsh_key_attr_lens[OVS_NSH_KEY_ATTR_MAX + 1] = { + [OVS_NSH_KEY_ATTR_BASE] = { .len = sizeof(struct ovs_nsh_key_base) }, + [OVS_NSH_KEY_ATTR_MD1] = { .len = sizeof(struct ovs_nsh_key_md1) }, + [OVS_NSH_KEY_ATTR_MD2] = { .len = OVS_ATTR_VARIABLE }, +}; + /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_ENCAP] = { .len = OVS_ATTR_NESTED }, @@ -409,6 +444,8 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { .len = sizeof(struct ovs_key_ct_tuple_ipv4) }, [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = { .len = sizeof(struct ovs_key_ct_tuple_ipv6) }, + [OVS_KEY_ATTR_NSH] = { .len = OVS_ATTR_NESTED, + .next = ovs_nsh_key_attr_lens, }, }; static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) @@ -1227,6 +1264,221 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, return 0; } +int nsh_hdr_from_nlattr(const struct nlattr *attr, + struct nshhdr *nh, size_t size) +{ + struct nlattr *a; + int rem; + u8 flags = 0; + u8 ttl = 0; + int mdlen = 0; + + /* validate_nsh has check this, so we needn't do duplicate check here + */ + if (size < NSH_BASE_HDR_LEN) + return -ENOBUFS; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + + switch (type) { + case OVS_NSH_KEY_ATTR_BASE: { + const struct ovs_nsh_key_base *base = nla_data(a); + + flags = base->flags; + ttl = base->ttl; + nh->np = base->np; + nh->mdtype = base->mdtype; + nh->path_hdr = base->path_hdr; + break; + } + case OVS_NSH_KEY_ATTR_MD1: + mdlen = nla_len(a); + if (mdlen > size - NSH_BASE_HDR_LEN) + return -ENOBUFS; + memcpy(&nh->md1, nla_data(a), mdlen); + break; + + case OVS_NSH_KEY_ATTR_MD2: + mdlen = nla_len(a); + if (mdlen > size - NSH_BASE_HDR_LEN) + return -ENOBUFS; + memcpy(&nh->md2, nla_data(a), mdlen); + break; + + default: + return -EINVAL; + } + } + + /* nsh header length = NSH_BASE_HDR_LEN + mdlen */ + nh->ver_flags_ttl_len = 0; + nsh_set_flags_ttl_len(nh, flags, ttl, NSH_BASE_HDR_LEN + mdlen); + + return 0; +} + +int nsh_key_from_nlattr(const struct nlattr *attr, + struct ovs_key_nsh *nsh, struct ovs_key_nsh *nsh_mask) +{ + struct nlattr *a; + int rem; + + /* validate_nsh has check this, so we needn't do duplicate check here + */ + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + + switch (type) { + case OVS_NSH_KEY_ATTR_BASE: { + const struct ovs_nsh_key_base *base = nla_data(a); + const struct ovs_nsh_key_base *base_mask = base + 1; + + nsh->base = *base; + nsh_mask->base = *base_mask; + break; + } + case OVS_NSH_KEY_ATTR_MD1: { + const struct ovs_nsh_key_md1 *md1 = nla_data(a); + const struct ovs_nsh_key_md1 *md1_mask = md1 + 1; + + memcpy(nsh->context, md1->context, sizeof(*md1)); + memcpy(nsh_mask->context, md1_mask->context, + sizeof(*md1_mask)); + break; + } + case OVS_NSH_KEY_ATTR_MD2: + /* Not supported yet */ + return -ENOTSUPP; + default: + return -EINVAL; + } + } + + return 0; +} + +static int nsh_key_put_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool is_push_nsh, bool log) +{ + struct nlattr *a; + int rem; + bool has_base = false; + bool has_md1 = false; + bool has_md2 = false; + u8 mdtype = 0; + int mdlen = 0; + + if (WARN_ON(is_push_nsh && is_mask)) + return -EINVAL; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + int i; + + if (type > OVS_NSH_KEY_ATTR_MAX) { + OVS_NLERR(log, "nsh attr %d is out of range max %d", + type, OVS_NSH_KEY_ATTR_MAX); + return -EINVAL; + } + + if (!check_attr_len(nla_len(a), + ovs_nsh_key_attr_lens[type].len)) { + OVS_NLERR( + log, + "nsh attr %d has unexpected len %d expected %d", + type, + nla_len(a), + ovs_nsh_key_attr_lens[type].len + ); + return -EINVAL; + } + + switch (type) { + case OVS_NSH_KEY_ATTR_BASE: { + const struct ovs_nsh_key_base *base = nla_data(a); + + has_base = true; + mdtype = base->mdtype; + SW_FLOW_KEY_PUT(match, nsh.base.flags, + base->flags, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.ttl, + base->ttl, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.mdtype, + base->mdtype, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.np, + base->np, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.path_hdr, + base->path_hdr, is_mask); + break; + } + case OVS_NSH_KEY_ATTR_MD1: { + const struct ovs_nsh_key_md1 *md1 = nla_data(a); + + has_md1 = true; + for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) + SW_FLOW_KEY_PUT(match, nsh.context[i], + md1->context[i], is_mask); + break; + } + case OVS_NSH_KEY_ATTR_MD2: + if (!is_push_nsh) /* Not supported MD type 2 yet */ + return -ENOTSUPP; + + has_md2 = true; + mdlen = nla_len(a); + if (mdlen > NSH_CTX_HDRS_MAX_LEN || mdlen <= 0) { + OVS_NLERR( + log, + "Invalid MD length %d for MD type %d", + mdlen, + mdtype + ); + return -EINVAL; + } + break; + default: + OVS_NLERR(log, "Unknown nsh attribute %d", + type); + return -EINVAL; + } + } + + if (rem > 0) { + OVS_NLERR(log, "nsh attribute has %d unknown bytes.", rem); + return -EINVAL; + } + + if (has_md1 && has_md2) { + OVS_NLERR( + 1, + "invalid nsh attribute: md1 and md2 are exclusive." + ); + return -EINVAL; + } + + if (!is_mask) { + if ((has_md1 && mdtype != NSH_M_TYPE1) || + (has_md2 && mdtype != NSH_M_TYPE2)) { + OVS_NLERR(1, "nsh attribute has unmatched MD type %d.", + mdtype); + return -EINVAL; + } + + if (is_push_nsh && + (!has_base || (!has_md1 && !has_md2))) { + OVS_NLERR( + 1, + "push_nsh: missing base or metadata attributes" + ); + return -EINVAL; + } + } + + return 0; +} + static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, u64 attrs, const struct nlattr **a, bool is_mask, bool log) @@ -1354,6 +1606,13 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, attrs &= ~(1 << OVS_KEY_ATTR_ARP); } + if (attrs & (1 << OVS_KEY_ATTR_NSH)) { + if (nsh_key_put_from_nlattr(a[OVS_KEY_ATTR_NSH], match, + is_mask, false, log) < 0) + return -EINVAL; + attrs &= ~(1 << OVS_KEY_ATTR_NSH); + } + if (attrs & (1 << OVS_KEY_ATTR_MPLS)) { const struct ovs_key_mpls *mpls_key; @@ -1670,6 +1929,34 @@ static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh, return 0; } +static int nsh_key_to_nlattr(const struct ovs_key_nsh *nsh, bool is_mask, + struct sk_buff *skb) +{ + struct nlattr *start; + + start = nla_nest_start(skb, OVS_KEY_ATTR_NSH); + if (!start) + return -EMSGSIZE; + + if (nla_put(skb, OVS_NSH_KEY_ATTR_BASE, sizeof(nsh->base), &nsh->base)) + goto nla_put_failure; + + if (is_mask || nsh->base.mdtype == NSH_M_TYPE1) { + if (nla_put(skb, OVS_NSH_KEY_ATTR_MD1, + sizeof(nsh->context), nsh->context)) + goto nla_put_failure; + } + + /* Don't support MD type 2 yet */ + + nla_nest_end(skb, start); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int __ovs_nla_put_key(const struct sw_flow_key *swkey, const struct sw_flow_key *output, bool is_mask, struct sk_buff *skb) @@ -1798,6 +2085,9 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ipv6_key->ipv6_tclass = output->ip.tos; ipv6_key->ipv6_hlimit = output->ip.ttl; ipv6_key->ipv6_frag = output->ip.frag; + } else if (swkey->eth.type == htons(ETH_P_NSH)) { + if (nsh_key_to_nlattr(&output->nsh, is_mask, skb)) + goto nla_put_failure; } else if (swkey->eth.type == htons(ETH_P_ARP) || swkey->eth.type == htons(ETH_P_RARP)) { struct ovs_key_arp *arp_key; @@ -2292,6 +2582,19 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, return err; } +static bool validate_nsh(const struct nlattr *attr, bool is_mask, + bool is_push_nsh, bool log) +{ + struct sw_flow_match match; + struct sw_flow_key key; + int ret = 0; + + ovs_match_init(&match, &key, true, NULL); + ret = nsh_key_put_from_nlattr(attr, &match, is_mask, + is_push_nsh, log); + return !ret; +} + /* Return false if there are any non-masked bits set. * Mask follows data immediately, before any netlink padding. */ @@ -2434,6 +2737,13 @@ static int validate_set(const struct nlattr *a, break; + case OVS_KEY_ATTR_NSH: + if (eth_type != htons(ETH_P_NSH)) + return -EINVAL; + if (!validate_nsh(nla_data(a), masked, false, log)) + return -EINVAL; + break; + default: return -EINVAL; } @@ -2533,6 +2843,9 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc), [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth), [OVS_ACTION_ATTR_POP_ETH] = 0, + [OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1, + [OVS_ACTION_ATTR_POP_NSH] = 0, + [OVS_ACTION_ATTR_METER] = sizeof(u32), }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -2690,6 +3003,38 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, mac_proto = MAC_PROTO_ETHERNET; break; + case OVS_ACTION_ATTR_PUSH_NSH: + if (mac_proto != MAC_PROTO_ETHERNET) { + u8 next_proto; + + next_proto = tun_p_from_eth_p(eth_type); + if (!next_proto) + return -EINVAL; + } + mac_proto = MAC_PROTO_NONE; + if (!validate_nsh(nla_data(a), false, true, true)) + return -EINVAL; + break; + + case OVS_ACTION_ATTR_POP_NSH: { + __be16 inner_proto; + + if (eth_type != htons(ETH_P_NSH)) + return -EINVAL; + inner_proto = tun_p_to_eth_p(key->nsh.base.np); + if (!inner_proto) + return -EINVAL; + if (key->nsh.base.np == TUN_P_ETHERNET) + mac_proto = MAC_PROTO_ETHERNET; + else + mac_proto = MAC_PROTO_NONE; + break; + } + + case OVS_ACTION_ATTR_METER: + /* Non-existent meters are simply ignored. */ + break; + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 929c665ac3aa..6657606b2b47 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -79,4 +79,9 @@ int ovs_nla_put_actions(const struct nlattr *attr, void ovs_nla_free_flow_actions(struct sw_flow_actions *); void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *); +int nsh_key_from_nlattr(const struct nlattr *attr, struct ovs_key_nsh *nsh, + struct ovs_key_nsh *nsh_mask); +int nsh_hdr_from_nlattr(const struct nlattr *attr, struct nshhdr *nh, + size_t size); + #endif /* flow_netlink.h */ diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c new file mode 100644 index 000000000000..3fbfc78991ac --- /dev/null +++ b/net/openvswitch/meter.c @@ -0,0 +1,597 @@ +/* + * Copyright (c) 2017 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/if.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/kernel.h> +#include <linux/openvswitch.h> +#include <linux/netlink.h> +#include <linux/rculist.h> + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "datapath.h" +#include "meter.h" + +#define METER_HASH_BUCKETS 1024 + +static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { + [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, + [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, + [OVS_METER_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, + [OVS_METER_ATTR_BANDS] = { .type = NLA_NESTED }, + [OVS_METER_ATTR_USED] = { .type = NLA_U64 }, + [OVS_METER_ATTR_CLEAR] = { .type = NLA_FLAG }, + [OVS_METER_ATTR_MAX_METERS] = { .type = NLA_U32 }, + [OVS_METER_ATTR_MAX_BANDS] = { .type = NLA_U32 }, +}; + +static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { + [OVS_BAND_ATTR_TYPE] = { .type = NLA_U32, }, + [OVS_BAND_ATTR_RATE] = { .type = NLA_U32, }, + [OVS_BAND_ATTR_BURST] = { .type = NLA_U32, }, + [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, +}; + +static void ovs_meter_free(struct dp_meter *meter) +{ + if (!meter) + return; + + kfree_rcu(meter, rcu); +} + +static struct hlist_head *meter_hash_bucket(const struct datapath *dp, + u32 meter_id) +{ + return &dp->meters[meter_id & (METER_HASH_BUCKETS - 1)]; +} + +/* Call with ovs_mutex or RCU read lock. */ +static struct dp_meter *lookup_meter(const struct datapath *dp, + u32 meter_id) +{ + struct dp_meter *meter; + struct hlist_head *head; + + head = meter_hash_bucket(dp, meter_id); + hlist_for_each_entry_rcu(meter, head, dp_hash_node) { + if (meter->id == meter_id) + return meter; + } + return NULL; +} + +static void attach_meter(struct datapath *dp, struct dp_meter *meter) +{ + struct hlist_head *head = meter_hash_bucket(dp, meter->id); + + hlist_add_head_rcu(&meter->dp_hash_node, head); +} + +static void detach_meter(struct dp_meter *meter) +{ + ASSERT_OVSL(); + if (meter) + hlist_del_rcu(&meter->dp_hash_node); +} + +static struct sk_buff * +ovs_meter_cmd_reply_start(struct genl_info *info, u8 cmd, + struct ovs_header **ovs_reply_header) +{ + struct sk_buff *skb; + struct ovs_header *ovs_header = info->userhdr; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return ERR_PTR(-ENOMEM); + + *ovs_reply_header = genlmsg_put(skb, info->snd_portid, + info->snd_seq, + &dp_meter_genl_family, 0, cmd); + if (!*ovs_reply_header) { + nlmsg_free(skb); + return ERR_PTR(-EMSGSIZE); + } + (*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex; + + return skb; +} + +static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, u32 meter_id, + struct dp_meter *meter) +{ + struct nlattr *nla; + struct dp_meter_band *band; + u16 i; + + if (nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id)) + goto error; + + if (!meter) + return 0; + + if (nla_put(reply, OVS_METER_ATTR_STATS, + sizeof(struct ovs_flow_stats), &meter->stats) || + nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, + OVS_METER_ATTR_PAD)) + goto error; + + nla = nla_nest_start(reply, OVS_METER_ATTR_BANDS); + if (!nla) + goto error; + + band = meter->bands; + + for (i = 0; i < meter->n_bands; ++i, ++band) { + struct nlattr *band_nla; + + band_nla = nla_nest_start(reply, OVS_BAND_ATTR_UNSPEC); + if (!band_nla || nla_put(reply, OVS_BAND_ATTR_STATS, + sizeof(struct ovs_flow_stats), + &band->stats)) + goto error; + nla_nest_end(reply, band_nla); + } + nla_nest_end(reply, nla); + + return 0; +error: + return -EMSGSIZE; +} + +static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *reply; + struct ovs_header *ovs_reply_header; + struct nlattr *nla, *band_nla; + int err; + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES, + &ovs_reply_header); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, U32_MAX) || + nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) + goto nla_put_failure; + + nla = nla_nest_start(reply, OVS_METER_ATTR_BANDS); + if (!nla) + goto nla_put_failure; + + band_nla = nla_nest_start(reply, OVS_BAND_ATTR_UNSPEC); + if (!band_nla) + goto nla_put_failure; + /* Currently only DROP band type is supported. */ + if (nla_put_u32(reply, OVS_BAND_ATTR_TYPE, OVS_METER_BAND_TYPE_DROP)) + goto nla_put_failure; + nla_nest_end(reply, band_nla); + nla_nest_end(reply, nla); + + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +nla_put_failure: + nlmsg_free(reply); + err = -EMSGSIZE; + return err; +} + +static struct dp_meter *dp_meter_create(struct nlattr **a) +{ + struct nlattr *nla; + int rem; + u16 n_bands = 0; + struct dp_meter *meter; + struct dp_meter_band *band; + int err; + + /* Validate attributes, count the bands. */ + if (!a[OVS_METER_ATTR_BANDS]) + return ERR_PTR(-EINVAL); + + nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) + if (++n_bands > DP_MAX_BANDS) + return ERR_PTR(-EINVAL); + + /* Allocate and set up the meter before locking anything. */ + meter = kzalloc(n_bands * sizeof(struct dp_meter_band) + + sizeof(*meter), GFP_KERNEL); + if (!meter) + return ERR_PTR(-ENOMEM); + + meter->used = div_u64(ktime_get_ns(), 1000 * 1000); + meter->kbps = a[OVS_METER_ATTR_KBPS] ? 1 : 0; + meter->keep_stats = !a[OVS_METER_ATTR_CLEAR]; + spin_lock_init(&meter->lock); + if (meter->keep_stats && a[OVS_METER_ATTR_STATS]) { + meter->stats = *(struct ovs_flow_stats *) + nla_data(a[OVS_METER_ATTR_STATS]); + } + meter->n_bands = n_bands; + + /* Set up meter bands. */ + band = meter->bands; + nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) { + struct nlattr *attr[OVS_BAND_ATTR_MAX + 1]; + u32 band_max_delta_t; + + err = nla_parse((struct nlattr **)&attr, OVS_BAND_ATTR_MAX, + nla_data(nla), nla_len(nla), band_policy, + NULL); + if (err) + goto exit_free_meter; + + if (!attr[OVS_BAND_ATTR_TYPE] || + !attr[OVS_BAND_ATTR_RATE] || + !attr[OVS_BAND_ATTR_BURST]) { + err = -EINVAL; + goto exit_free_meter; + } + + band->type = nla_get_u32(attr[OVS_BAND_ATTR_TYPE]); + band->rate = nla_get_u32(attr[OVS_BAND_ATTR_RATE]); + band->burst_size = nla_get_u32(attr[OVS_BAND_ATTR_BURST]); + /* Figure out max delta_t that is enough to fill any bucket. + * Keep max_delta_t size to the bucket units: + * pkts => 1/1000 packets, kilobits => bits. + */ + band_max_delta_t = (band->burst_size + band->rate) * 1000; + /* Start with a full bucket. */ + band->bucket = band_max_delta_t; + if (band_max_delta_t > meter->max_delta_t) + meter->max_delta_t = band_max_delta_t; + band++; + } + + return meter; + +exit_free_meter: + kfree(meter); + return ERR_PTR(err); +} + +static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct dp_meter *meter, *old_meter; + struct sk_buff *reply; + struct ovs_header *ovs_reply_header; + struct ovs_header *ovs_header = info->userhdr; + struct datapath *dp; + int err; + u32 meter_id; + bool failed; + + meter = dp_meter_create(a); + if (IS_ERR_OR_NULL(meter)) + return PTR_ERR(meter); + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET, + &ovs_reply_header); + if (IS_ERR(reply)) { + err = PTR_ERR(reply); + goto exit_free_meter; + } + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + if (!a[OVS_METER_ATTR_ID]) { + err = -ENODEV; + goto exit_unlock; + } + + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + + /* Cannot fail after this. */ + old_meter = lookup_meter(dp, meter_id); + detach_meter(old_meter); + attach_meter(dp, meter); + ovs_unlock(); + + /* Build response with the meter_id and stats from + * the old meter, if any. + */ + failed = nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id); + WARN_ON(failed); + if (old_meter) { + spin_lock_bh(&old_meter->lock); + if (old_meter->keep_stats) { + err = ovs_meter_cmd_reply_stats(reply, meter_id, + old_meter); + WARN_ON(err); + } + spin_unlock_bh(&old_meter->lock); + ovs_meter_free(old_meter); + } + + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_unlock: + ovs_unlock(); + nlmsg_free(reply); +exit_free_meter: + kfree(meter); + return err; +} + +static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + u32 meter_id; + struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_reply_header; + struct datapath *dp; + int err; + struct sk_buff *reply; + struct dp_meter *meter; + + if (!a[OVS_METER_ATTR_ID]) + return -EINVAL; + + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_GET, + &ovs_reply_header); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + ovs_lock(); + + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + /* Locate meter, copy stats. */ + meter = lookup_meter(dp, meter_id); + if (!meter) { + err = -ENOENT; + goto exit_unlock; + } + + spin_lock_bh(&meter->lock); + err = ovs_meter_cmd_reply_stats(reply, meter_id, meter); + spin_unlock_bh(&meter->lock); + if (err) + goto exit_unlock; + + ovs_unlock(); + + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_unlock: + ovs_unlock(); + nlmsg_free(reply); + return err; +} + +static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + u32 meter_id; + struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_reply_header; + struct datapath *dp; + int err; + struct sk_buff *reply; + struct dp_meter *old_meter; + + if (!a[OVS_METER_ATTR_ID]) + return -EINVAL; + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_DEL, + &ovs_reply_header); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + ovs_lock(); + + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + old_meter = lookup_meter(dp, meter_id); + if (old_meter) { + spin_lock_bh(&old_meter->lock); + err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); + WARN_ON(err); + spin_unlock_bh(&old_meter->lock); + detach_meter(old_meter); + } + ovs_unlock(); + ovs_meter_free(old_meter); + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_unlock: + ovs_unlock(); + nlmsg_free(reply); + return err; +} + +/* Meter action execution. + * + * Return true 'meter_id' drop band is triggered. The 'skb' should be + * dropped by the caller'. + */ +bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, u32 meter_id) +{ + struct dp_meter *meter; + struct dp_meter_band *band; + long long int now_ms = div_u64(ktime_get_ns(), 1000 * 1000); + long long int long_delta_ms; + u32 delta_ms; + u32 cost; + int i, band_exceeded_max = -1; + u32 band_exceeded_rate = 0; + + meter = lookup_meter(dp, meter_id); + /* Do not drop the packet when there is no meter. */ + if (!meter) + return false; + + /* Lock the meter while using it. */ + spin_lock(&meter->lock); + + long_delta_ms = (now_ms - meter->used); /* ms */ + + /* Make sure delta_ms will not be too large, so that bucket will not + * wrap around below. + */ + delta_ms = (long_delta_ms > (long long int)meter->max_delta_t) + ? meter->max_delta_t : (u32)long_delta_ms; + + /* Update meter statistics. + */ + meter->used = now_ms; + meter->stats.n_packets += 1; + meter->stats.n_bytes += skb->len; + + /* Bucket rate is either in kilobits per second, or in packets per + * second. We maintain the bucket in the units of either bits or + * 1/1000th of a packet, correspondingly. + * Then, when rate is multiplied with milliseconds, we get the + * bucket units: + * msec * kbps = bits, and + * msec * packets/sec = 1/1000 packets. + * + * 'cost' is the number of bucket units in this packet. + */ + cost = (meter->kbps) ? skb->len * 8 : 1000; + + /* Update all bands and find the one hit with the highest rate. */ + for (i = 0; i < meter->n_bands; ++i) { + long long int max_bucket_size; + + band = &meter->bands[i]; + max_bucket_size = (band->burst_size + band->rate) * 1000; + + band->bucket += delta_ms * band->rate; + if (band->bucket > max_bucket_size) + band->bucket = max_bucket_size; + + if (band->bucket >= cost) { + band->bucket -= cost; + } else if (band->rate > band_exceeded_rate) { + band_exceeded_rate = band->rate; + band_exceeded_max = i; + } + } + + if (band_exceeded_max >= 0) { + /* Update band statistics. */ + band = &meter->bands[band_exceeded_max]; + band->stats.n_packets += 1; + band->stats.n_bytes += skb->len; + + /* Drop band triggered, let the caller drop the 'skb'. */ + if (band->type == OVS_METER_BAND_TYPE_DROP) { + spin_unlock(&meter->lock); + return true; + } + } + + spin_unlock(&meter->lock); + return false; +} + +static struct genl_ops dp_meter_genl_ops[] = { + { .cmd = OVS_METER_CMD_FEATURES, + .flags = 0, /* OK for unprivileged users. */ + .policy = meter_policy, + .doit = ovs_meter_cmd_features + }, + { .cmd = OVS_METER_CMD_SET, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ + .policy = meter_policy, + .doit = ovs_meter_cmd_set, + }, + { .cmd = OVS_METER_CMD_GET, + .flags = 0, /* OK for unprivileged users. */ + .policy = meter_policy, + .doit = ovs_meter_cmd_get, + }, + { .cmd = OVS_METER_CMD_DEL, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ + .policy = meter_policy, + .doit = ovs_meter_cmd_del + }, +}; + +static const struct genl_multicast_group ovs_meter_multicast_group = { + .name = OVS_METER_MCGROUP, +}; + +struct genl_family dp_meter_genl_family __ro_after_init = { + .hdrsize = sizeof(struct ovs_header), + .name = OVS_METER_FAMILY, + .version = OVS_METER_VERSION, + .maxattr = OVS_METER_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .ops = dp_meter_genl_ops, + .n_ops = ARRAY_SIZE(dp_meter_genl_ops), + .mcgrps = &ovs_meter_multicast_group, + .n_mcgrps = 1, + .module = THIS_MODULE, +}; + +int ovs_meters_init(struct datapath *dp) +{ + int i; + + dp->meters = kmalloc_array(METER_HASH_BUCKETS, + sizeof(struct hlist_head), GFP_KERNEL); + + if (!dp->meters) + return -ENOMEM; + + for (i = 0; i < METER_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(&dp->meters[i]); + + return 0; +} + +void ovs_meters_exit(struct datapath *dp) +{ + int i; + + for (i = 0; i < METER_HASH_BUCKETS; i++) { + struct hlist_head *head = &dp->meters[i]; + struct dp_meter *meter; + struct hlist_node *n; + + hlist_for_each_entry_safe(meter, n, head, dp_hash_node) + kfree(meter); + } + + kfree(dp->meters); +} diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h new file mode 100644 index 000000000000..964ace2650f8 --- /dev/null +++ b/net/openvswitch/meter.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2017 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ + +#ifndef METER_H +#define METER_H 1 + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/netlink.h> +#include <linux/openvswitch.h> +#include <linux/genetlink.h> +#include <linux/skbuff.h> + +#include "flow.h" +struct datapath; + +#define DP_MAX_BANDS 1 + +struct dp_meter_band { + u32 type; + u32 rate; + u32 burst_size; + u32 bucket; /* 1/1000 packets, or in bits */ + struct ovs_flow_stats stats; +}; + +struct dp_meter { + spinlock_t lock; /* Per meter lock */ + struct rcu_head rcu; + struct hlist_node dp_hash_node; /*Element in datapath->meters + * hash table. + */ + u32 id; + u16 kbps:1, keep_stats:1; + u16 n_bands; + u32 max_delta_t; + u64 used; + struct ovs_flow_stats stats; + struct dp_meter_band bands[]; +}; + +extern struct genl_family dp_meter_genl_family; +int ovs_meters_init(struct datapath *dp); +void ovs_meters_exit(struct datapath *dp); +bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, u32 meter_id); + +#endif /* meter.h */ diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9603f6ff17a4..737092ca9b4e 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4560,6 +4560,7 @@ static int __net_init packet_net_init(struct net *net) static void __net_exit packet_net_exit(struct net *net) { remove_proc_entry("packet", net->proc_net); + WARN_ON_ONCE(!hlist_empty(&net->packet.sklist)); } static struct pernet_operations packet_net_ops = { diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 2cb4c5dfad6f..77787512fc32 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -331,7 +331,10 @@ static int __net_init phonet_init_net(struct net *net) static void __net_exit phonet_exit_net(struct net *net) { + struct phonet_net *pnn = phonet_pernet(net); + remove_proc_entry("phonet", net->proc_net); + WARN_ON_ONCE(!list_empty(&pnn->pndevs.list)); } static struct pernet_operations phonet_net_ops = { diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index e458ece96d3d..77ab05e23001 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -1120,7 +1120,7 @@ static int __init qrtr_proto_init(void) return 0; } -module_init(qrtr_proto_init); +postcore_initcall(qrtr_proto_init); static void __exit qrtr_proto_fini(void) { diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c index 86ef907067bb..e0f70c4051b6 100644 --- a/net/rds/ib_fmr.c +++ b/net/rds/ib_fmr.c @@ -139,8 +139,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, return -EINVAL; } - dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, - rdsibdev_to_node(rds_ibdev)); + dma_pages = kmalloc_array_node(sizeof(u64), page_cnt, GFP_ATOMIC, + rdsibdev_to_node(rds_ibdev)); if (!dma_pages) { ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); return -ENOMEM; diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 9722bf839d9d..b4e421aa9727 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -410,14 +410,14 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) break; } - /* XXX when can this fail? */ - ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); - rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv, + rdsdebug("recv %p ibinc %p page %p addr %lu\n", recv, recv->r_ibinc, sg_page(&recv->r_frag->f_sg), (long) ib_sg_dma_address( ic->i_cm_id->device, - &recv->r_frag->f_sg), - ret); + &recv->r_frag->f_sg)); + + /* XXX when can this fail? */ + ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); if (ret) { rds_ib_conn_error(conn, "recv post on " "%pI4 returned %d, disconnecting and " diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 344b2dcad52d..9b5c46b052fd 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -322,6 +322,14 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, } EXPORT_SYMBOL(rxrpc_kernel_begin_call); +/* + * Dummy function used to stop the notifier talking to recvmsg(). + */ +static void rxrpc_dummy_notify_rx(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long call_user_ID) +{ +} + /** * rxrpc_kernel_end_call - Allow a kernel service to end a call it was using * @sock: The socket the call is on @@ -336,6 +344,14 @@ void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) mutex_lock(&call->user_mutex); rxrpc_release_call(rxrpc_sk(sock->sk), call); + + /* Make sure we're not going to call back into a kernel service */ + if (call->notify_rx) { + spin_lock_bh(&call->notify_lock); + call->notify_rx = rxrpc_dummy_notify_rx; + spin_unlock_bh(&call->notify_lock); + } + mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put_kernel); } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ea5600b747cc..b2151993d384 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -525,6 +525,7 @@ struct rxrpc_call { unsigned long flags; unsigned long events; spinlock_t lock; + spinlock_t notify_lock; /* Kernel notification lock */ rwlock_t state_lock; /* lock for state transition */ u32 abort_code; /* Local/remote abort code */ int error; /* Local error incurred */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 7a77844aab16..3574508baf9a 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -386,7 +386,7 @@ recheck_state: now = ktime_get_real(); if (ktime_before(call->expire_at, now)) { - rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, -ETIME); + rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ETIME); set_bit(RXRPC_CALL_EV_ABORT, &call->events); goto recheck_state; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index fcdd6555a820..4c7fbc6dcce7 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -124,6 +124,7 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) INIT_LIST_HEAD(&call->sock_link); init_waitqueue_head(&call->waitq); spin_lock_init(&call->lock); + spin_lock_init(&call->notify_lock); rwlock_init(&call->state_lock); atomic_set(&call->usage, 1); call->debug_id = atomic_inc_return(&rxrpc_debug_id); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 1e37eb1c0c66..1b592073ec96 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -298,8 +298,6 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, write_unlock(&call->state_lock); if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) { - rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, false, true, - rxrpc_propose_ack_client_tx_end); trace_rxrpc_transmit(call, rxrpc_transmit_await_reply); } else { trace_rxrpc_transmit(call, rxrpc_transmit_end); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 71e6f713fbe7..f47659c7b224 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -35,7 +35,8 @@ struct rxrpc_abort_buffer { /* * Fill out an ACK packet. */ -static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, +static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, + struct rxrpc_call *call, struct rxrpc_ack_buffer *pkt, rxrpc_seq_t *_hard_ack, rxrpc_seq_t *_top, @@ -77,8 +78,8 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, } while (before_eq(seq, top)); } - mtu = call->conn->params.peer->if_mtu; - mtu -= call->conn->params.peer->hdrsize; + mtu = conn->params.peer->if_mtu; + mtu -= conn->params.peer->hdrsize; jmax = (call->nr_jumbo_bad > 3) ? 1 : rxrpc_rx_jumbo_max; pkt->ackinfo.rxMTU = htonl(rxrpc_rx_mtu); pkt->ackinfo.maxMTU = htonl(mtu); @@ -148,7 +149,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping) } call->ackr_reason = 0; } - n = rxrpc_fill_out_ack(call, pkt, &hard_ack, &top, reason); + n = rxrpc_fill_out_ack(conn, call, pkt, &hard_ack, &top, reason); spin_unlock_bh(&call->lock); @@ -221,6 +222,16 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) rxrpc_serial_t serial; int ret; + /* Don't bother sending aborts for a client call once the server has + * hard-ACK'd all of its request data. After that point, we're not + * going to stop the operation proceeding, and whilst we might limit + * the reply, it's not worth it if we can send a new call on the same + * channel instead, thereby closing off this call. + */ + if (rxrpc_is_client_call(call) && + test_bit(RXRPC_CALL_TX_LAST, &call->flags)) + return 0; + spin_lock_bh(&call->lock); if (call->conn) conn = rxrpc_get_connection_maybe(call->conn); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index e4937b3f3685..8510a98b87e1 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -40,7 +40,9 @@ void rxrpc_notify_socket(struct rxrpc_call *call) sk = &rx->sk; if (rx && sk->sk_state < RXRPC_CLOSE) { if (call->notify_rx) { + spin_lock_bh(&call->notify_lock); call->notify_rx(sk, call, call->user_call_ID); + spin_unlock_bh(&call->notify_lock); } else { write_lock_bh(&rx->recvmsg_lock); if (list_empty(&call->recvmsg_link)) { diff --git a/net/sched/act_api.c b/net/sched/act_api.c index ff4d69082376..4d33a50a8a6d 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -80,7 +80,6 @@ static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p) spin_lock_bh(&idrinfo->lock); idr_remove_ext(&idrinfo->action_idr, p->tcfa_index); spin_unlock_bh(&idrinfo->lock); - put_net(idrinfo->net); gen_kill_estimator(&p->tcfa_rate_est); free_tcf(p); } @@ -339,7 +338,6 @@ err3: p->idrinfo = idrinfo; p->ops = ops; INIT_LIST_HEAD(&p->list); - get_net(idrinfo->net); *a = p; return 0; } diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 035ed268290b..5ef8ce8c83d4 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -398,7 +398,7 @@ static __net_init int bpf_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, bpf_net_id); - return tc_action_net_init(tn, &act_bpf_ops, net); + return tc_action_net_init(tn, &act_bpf_ops); } static void __net_exit bpf_exit_net(struct net *net) diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 34e52d01a5dd..10b7a8855a6c 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -206,7 +206,7 @@ static __net_init int connmark_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, connmark_net_id); - return tc_action_net_init(tn, &act_connmark_ops, net); + return tc_action_net_init(tn, &act_connmark_ops); } static void __net_exit connmark_exit_net(struct net *net) diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 35171df2ebef..1c40caadcff9 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -626,7 +626,7 @@ static __net_init int csum_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, csum_net_id); - return tc_action_net_init(tn, &act_csum_ops, net); + return tc_action_net_init(tn, &act_csum_ops); } static void __net_exit csum_exit_net(struct net *net) diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index ef7f7f39d26d..e29a48ef7fc3 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -232,7 +232,7 @@ static __net_init int gact_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, gact_net_id); - return tc_action_net_init(tn, &act_gact_ops, net); + return tc_action_net_init(tn, &act_gact_ops); } static void __net_exit gact_exit_net(struct net *net) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index eca272533418..3007cb1310ea 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -855,7 +855,7 @@ static __net_init int ife_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, ife_net_id); - return tc_action_net_init(tn, &act_ife_ops, net); + return tc_action_net_init(tn, &act_ife_ops); } static void __net_exit ife_exit_net(struct net *net) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index dbdf3b2470d5..d9e399a7e3d5 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -334,7 +334,7 @@ static __net_init int ipt_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, ipt_net_id); - return tc_action_net_init(tn, &act_ipt_ops, net); + return tc_action_net_init(tn, &act_ipt_ops); } static void __net_exit ipt_exit_net(struct net *net) @@ -384,7 +384,7 @@ static __net_init int xt_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, xt_net_id); - return tc_action_net_init(tn, &act_xt_ops, net); + return tc_action_net_init(tn, &act_xt_ops); } static void __net_exit xt_exit_net(struct net *net) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 8df5775bbf22..8b3e59388480 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -340,7 +340,7 @@ static __net_init int mirred_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, mirred_net_id); - return tc_action_net_init(tn, &act_mirred_ops, net); + return tc_action_net_init(tn, &act_mirred_ops); } static void __net_exit mirred_exit_net(struct net *net) diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 7eeaaf9217b6..c365d01b99c8 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -307,7 +307,7 @@ static __net_init int nat_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, nat_net_id); - return tc_action_net_init(tn, &act_nat_ops, net); + return tc_action_net_init(tn, &act_nat_ops); } static void __net_exit nat_exit_net(struct net *net) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index b3d82c334a5f..491fe5deb09e 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -450,7 +450,7 @@ static __net_init int pedit_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, pedit_net_id); - return tc_action_net_init(tn, &act_pedit_ops, net); + return tc_action_net_init(tn, &act_pedit_ops); } static void __net_exit pedit_exit_net(struct net *net) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 9ec42b26e4b9..3bb2ebf9e9ae 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -331,7 +331,7 @@ static __net_init int police_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, police_net_id); - return tc_action_net_init(tn, &act_police_ops, net); + return tc_action_net_init(tn, &act_police_ops); } static void __net_exit police_exit_net(struct net *net) diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index e69a1e3a39bf..8b5abcd2f32f 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -240,7 +240,7 @@ static __net_init int sample_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, sample_net_id); - return tc_action_net_init(tn, &act_sample_ops, net); + return tc_action_net_init(tn, &act_sample_ops); } static void __net_exit sample_exit_net(struct net *net) diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index a8d0ea95f894..e7b57e5071a3 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -201,7 +201,7 @@ static __net_init int simp_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, simp_net_id); - return tc_action_net_init(tn, &act_simp_ops, net); + return tc_action_net_init(tn, &act_simp_ops); } static void __net_exit simp_exit_net(struct net *net) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index fbac62472e09..59949d61f20d 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -238,7 +238,7 @@ static __net_init int skbedit_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); - return tc_action_net_init(tn, &act_skbedit_ops, net); + return tc_action_net_init(tn, &act_skbedit_ops); } static void __net_exit skbedit_exit_net(struct net *net) diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 8e12d8897d2f..b642ad3d39dd 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -263,7 +263,7 @@ static __net_init int skbmod_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); - return tc_action_net_init(tn, &act_skbmod_ops, net); + return tc_action_net_init(tn, &act_skbmod_ops); } static void __net_exit skbmod_exit_net(struct net *net) diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index c33faa373cf2..30c96274c638 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -322,7 +322,7 @@ static __net_init int tunnel_key_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); - return tc_action_net_init(tn, &act_tunnel_key_ops, net); + return tc_action_net_init(tn, &act_tunnel_key_ops); } static void __net_exit tunnel_key_exit_net(struct net *net) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 115fc33cc6d8..97f717a13ad5 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -26,14 +26,13 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_vlan *v = to_vlan(a); + struct tcf_vlan_params *p; int action; int err; u16 tci; - spin_lock(&v->tcf_lock); tcf_lastuse_update(&v->tcf_tm); - bstats_update(&v->tcf_bstats, skb); - action = v->tcf_action; + bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb); /* Ensure 'data' points at mac_header prior calling vlan manipulating * functions. @@ -41,15 +40,21 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, if (skb_at_tc_ingress(skb)) skb_push_rcsum(skb, skb->mac_len); - switch (v->tcfv_action) { + rcu_read_lock(); + + action = READ_ONCE(v->tcf_action); + + p = rcu_dereference(v->vlan_p); + + switch (p->tcfv_action) { case TCA_VLAN_ACT_POP: err = skb_vlan_pop(skb); if (err) goto drop; break; case TCA_VLAN_ACT_PUSH: - err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid | - (v->tcfv_push_prio << VLAN_PRIO_SHIFT)); + err = skb_vlan_push(skb, p->tcfv_push_proto, p->tcfv_push_vid | + (p->tcfv_push_prio << VLAN_PRIO_SHIFT)); if (err) goto drop; break; @@ -68,14 +73,14 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, goto drop; } /* replace the vid */ - tci = (tci & ~VLAN_VID_MASK) | v->tcfv_push_vid; + tci = (tci & ~VLAN_VID_MASK) | p->tcfv_push_vid; /* replace prio bits, if tcfv_push_prio specified */ - if (v->tcfv_push_prio) { + if (p->tcfv_push_prio) { tci &= ~VLAN_PRIO_MASK; - tci |= v->tcfv_push_prio << VLAN_PRIO_SHIFT; + tci |= p->tcfv_push_prio << VLAN_PRIO_SHIFT; } /* put updated tci as hwaccel tag */ - __vlan_hwaccel_put_tag(skb, v->tcfv_push_proto, tci); + __vlan_hwaccel_put_tag(skb, p->tcfv_push_proto, tci); break; default: BUG(); @@ -85,12 +90,13 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, drop: action = TC_ACT_SHOT; - v->tcf_qstats.drops++; + qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats)); + unlock: + rcu_read_unlock(); if (skb_at_tc_ingress(skb)) skb_pull_rcsum(skb, skb->mac_len); - spin_unlock(&v->tcf_lock); return action; } @@ -107,6 +113,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; + struct tcf_vlan_params *p, *p_old; struct tc_vlan *parm; struct tcf_vlan *v; int action; @@ -172,7 +179,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, - &act_vlan_ops, bind, false); + &act_vlan_ops, bind, true); if (ret) return ret; @@ -185,46 +192,67 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, v = to_vlan(*a); - spin_lock_bh(&v->tcf_lock); - - v->tcfv_action = action; - v->tcfv_push_vid = push_vid; - v->tcfv_push_prio = push_prio; - v->tcfv_push_proto = push_proto; + ASSERT_RTNL(); + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + if (ovr) + tcf_idr_release(*a, bind); + return -ENOMEM; + } v->tcf_action = parm->action; - spin_unlock_bh(&v->tcf_lock); + p_old = rtnl_dereference(v->vlan_p); + + p->tcfv_action = action; + p->tcfv_push_vid = push_vid; + p->tcfv_push_prio = push_prio; + p->tcfv_push_proto = push_proto; + + rcu_assign_pointer(v->vlan_p, p); + + if (p_old) + kfree_rcu(p_old, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; } +static void tcf_vlan_cleanup(struct tc_action *a, int bind) +{ + struct tcf_vlan *v = to_vlan(a); + struct tcf_vlan_params *p; + + p = rcu_dereference_protected(v->vlan_p, 1); + kfree_rcu(p, rcu); +} + static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_vlan *v = to_vlan(a); + struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p); struct tc_vlan opt = { .index = v->tcf_index, .refcnt = v->tcf_refcnt - ref, .bindcnt = v->tcf_bindcnt - bind, .action = v->tcf_action, - .v_action = v->tcfv_action, + .v_action = p->tcfv_action, }; struct tcf_t t; if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - if ((v->tcfv_action == TCA_VLAN_ACT_PUSH || - v->tcfv_action == TCA_VLAN_ACT_MODIFY) && - (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) || + if ((p->tcfv_action == TCA_VLAN_ACT_PUSH || + p->tcfv_action == TCA_VLAN_ACT_MODIFY) && + (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, p->tcfv_push_vid) || nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, - v->tcfv_push_proto) || + p->tcfv_push_proto) || (nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY, - v->tcfv_push_prio)))) + p->tcfv_push_prio)))) goto nla_put_failure; tcf_tm_dump(&t, &v->tcf_tm); @@ -260,6 +288,7 @@ static struct tc_action_ops act_vlan_ops = { .act = tcf_vlan, .dump = tcf_vlan_dump, .init = tcf_vlan_init, + .cleanup = tcf_vlan_cleanup, .walk = tcf_vlan_walker, .lookup = tcf_vlan_search, .size = sizeof(struct tcf_vlan), @@ -269,7 +298,7 @@ static __net_init int vlan_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, vlan_net_id); - return tc_action_net_init(tn, &act_vlan_ops, net); + return tc_action_net_init(tn, &act_vlan_ops); } static void __net_exit vlan_exit_net(struct net *net) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 206e19f4fc01..ab255b421781 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1110,6 +1110,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, exts->actions[i++] = act; exts->nr_actions = i; } + exts->net = net; } #else if ((exts->action && tb[exts->action]) || diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 871351358c10..5f169ded347e 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -87,16 +87,21 @@ static int basic_init(struct tcf_proto *tp) return 0; } +static void __basic_delete_filter(struct basic_filter *f) +{ + tcf_exts_destroy(&f->exts); + tcf_em_tree_destroy(&f->ematches); + tcf_exts_put_net(&f->exts); + kfree(f); +} + static void basic_delete_filter_work(struct work_struct *work) { struct basic_filter *f = container_of(work, struct basic_filter, work); rtnl_lock(); - tcf_exts_destroy(&f->exts); - tcf_em_tree_destroy(&f->ematches); + __basic_delete_filter(f); rtnl_unlock(); - - kfree(f); } static void basic_delete_filter(struct rcu_head *head) @@ -116,7 +121,10 @@ static void basic_destroy(struct tcf_proto *tp) list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); idr_remove_ext(&head->handle_idr, f->handle); - call_rcu(&f->rcu, basic_delete_filter); + if (tcf_exts_get_net(&f->exts)) + call_rcu(&f->rcu, basic_delete_filter); + else + __basic_delete_filter(f); } idr_destroy(&head->handle_idr); kfree_rcu(head, rcu); @@ -130,6 +138,7 @@ static int basic_delete(struct tcf_proto *tp, void *arg, bool *last) list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); idr_remove_ext(&head->handle_idr, f->handle); + tcf_exts_get_net(&f->exts); call_rcu(&f->rcu, basic_delete_filter); *last = list_empty(&head->flist); return 0; @@ -225,6 +234,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, idr_replace_ext(&head->handle_idr, fnew, fnew->handle); list_replace_rcu(&fold->link, &fnew->link); tcf_unbind_filter(tp, &fold->res); + tcf_exts_get_net(&fold->exts); call_rcu(&fold->rcu, basic_delete_filter); } else { list_add_rcu(&fnew->link, &head->flist); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index dc9bd9a0070b..fb680dafac5a 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -261,6 +261,7 @@ static int cls_bpf_init(struct tcf_proto *tp) static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog) { tcf_exts_destroy(&prog->exts); + tcf_exts_put_net(&prog->exts); if (cls_bpf_is_ebpf(prog)) bpf_prog_put(prog->filter); @@ -297,7 +298,10 @@ static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog) cls_bpf_stop_offload(tp, prog); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); - call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu); + if (tcf_exts_get_net(&prog->exts)) + call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu); + else + __cls_bpf_delete_prog(prog); } static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last) @@ -526,6 +530,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, idr_replace_ext(&head->handle_idr, prog, handle); list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); + tcf_exts_get_net(&oldprog->exts); call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu); } else { list_add_rcu(&prog->link, &head->plist); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index a97e069bee89..309d5899265f 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -60,15 +60,21 @@ static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = { [TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED }, }; +static void __cls_cgroup_destroy(struct cls_cgroup_head *head) +{ + tcf_exts_destroy(&head->exts); + tcf_em_tree_destroy(&head->ematches); + tcf_exts_put_net(&head->exts); + kfree(head); +} + static void cls_cgroup_destroy_work(struct work_struct *work) { struct cls_cgroup_head *head = container_of(work, struct cls_cgroup_head, work); rtnl_lock(); - tcf_exts_destroy(&head->exts); - tcf_em_tree_destroy(&head->ematches); - kfree(head); + __cls_cgroup_destroy(head); rtnl_unlock(); } @@ -124,8 +130,10 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, goto errout; rcu_assign_pointer(tp->root, new); - if (head) + if (head) { + tcf_exts_get_net(&head->exts); call_rcu(&head->rcu, cls_cgroup_destroy_rcu); + } return 0; errout: tcf_exts_destroy(&new->exts); @@ -138,8 +146,12 @@ static void cls_cgroup_destroy(struct tcf_proto *tp) struct cls_cgroup_head *head = rtnl_dereference(tp->root); /* Head can still be NULL due to cls_cgroup_init(). */ - if (head) - call_rcu(&head->rcu, cls_cgroup_destroy_rcu); + if (head) { + if (tcf_exts_get_net(&head->exts)) + call_rcu(&head->rcu, cls_cgroup_destroy_rcu); + else + __cls_cgroup_destroy(head); + } } static int cls_cgroup_delete(struct tcf_proto *tp, void *arg, bool *last) diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 9c08fcdaf41c..25c2a888e1f0 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -372,15 +372,21 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_PERTURB] = { .type = NLA_U32 }, }; -static void flow_destroy_filter_work(struct work_struct *work) +static void __flow_destroy_filter(struct flow_filter *f) { - struct flow_filter *f = container_of(work, struct flow_filter, work); - - rtnl_lock(); del_timer_sync(&f->perturb_timer); tcf_exts_destroy(&f->exts); tcf_em_tree_destroy(&f->ematches); + tcf_exts_put_net(&f->exts); kfree(f); +} + +static void flow_destroy_filter_work(struct work_struct *work) +{ + struct flow_filter *f = container_of(work, struct flow_filter, work); + + rtnl_lock(); + __flow_destroy_filter(f); rtnl_unlock(); } @@ -554,8 +560,10 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, *arg = fnew; - if (fold) + if (fold) { + tcf_exts_get_net(&fold->exts); call_rcu(&fold->rcu, flow_destroy_filter); + } return 0; err2: @@ -572,6 +580,7 @@ static int flow_delete(struct tcf_proto *tp, void *arg, bool *last) struct flow_filter *f = arg; list_del_rcu(&f->list); + tcf_exts_get_net(&f->exts); call_rcu(&f->rcu, flow_destroy_filter); *last = list_empty(&head->filters); return 0; @@ -596,7 +605,10 @@ static void flow_destroy(struct tcf_proto *tp) list_for_each_entry_safe(f, next, &head->filters, list) { list_del_rcu(&f->list); - call_rcu(&f->rcu, flow_destroy_filter); + if (tcf_exts_get_net(&f->exts)) + call_rcu(&f->rcu, flow_destroy_filter); + else + __flow_destroy_filter(f); } kfree_rcu(head, rcu); } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index c99fa9e5be46..543a3e875d05 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -193,13 +193,19 @@ static int fl_init(struct tcf_proto *tp) return 0; } +static void __fl_destroy_filter(struct cls_fl_filter *f) +{ + tcf_exts_destroy(&f->exts); + tcf_exts_put_net(&f->exts); + kfree(f); +} + static void fl_destroy_filter_work(struct work_struct *work) { struct cls_fl_filter *f = container_of(work, struct cls_fl_filter, work); rtnl_lock(); - tcf_exts_destroy(&f->exts); - kfree(f); + __fl_destroy_filter(f); rtnl_unlock(); } @@ -282,7 +288,10 @@ static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f) if (!tc_skip_hw(f->flags)) fl_hw_destroy_filter(tp, f); tcf_unbind_filter(tp, &f->res); - call_rcu(&f->rcu, fl_destroy_filter); + if (tcf_exts_get_net(&f->exts)) + call_rcu(&f->rcu, fl_destroy_filter); + else + __fl_destroy_filter(f); } static void fl_destroy_sleepable(struct work_struct *work) @@ -952,6 +961,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, idr_replace_ext(&head->handle_idr, fnew, fnew->handle); list_replace_rcu(&fold->list, &fnew->list); tcf_unbind_filter(tp, &fold->res); + tcf_exts_get_net(&fold->exts); call_rcu(&fold->rcu, fl_destroy_filter); } else { list_add_tail_rcu(&fnew->list, &head->filters); diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 5908f56f76da..20f0de1a960a 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -125,13 +125,19 @@ static int fw_init(struct tcf_proto *tp) return 0; } +static void __fw_delete_filter(struct fw_filter *f) +{ + tcf_exts_destroy(&f->exts); + tcf_exts_put_net(&f->exts); + kfree(f); +} + static void fw_delete_filter_work(struct work_struct *work) { struct fw_filter *f = container_of(work, struct fw_filter, work); rtnl_lock(); - tcf_exts_destroy(&f->exts); - kfree(f); + __fw_delete_filter(f); rtnl_unlock(); } @@ -157,7 +163,10 @@ static void fw_destroy(struct tcf_proto *tp) RCU_INIT_POINTER(head->ht[h], rtnl_dereference(f->next)); tcf_unbind_filter(tp, &f->res); - call_rcu(&f->rcu, fw_delete_filter); + if (tcf_exts_get_net(&f->exts)) + call_rcu(&f->rcu, fw_delete_filter); + else + __fw_delete_filter(f); } } kfree_rcu(head, rcu); @@ -182,6 +191,7 @@ static int fw_delete(struct tcf_proto *tp, void *arg, bool *last) if (pfp == f) { RCU_INIT_POINTER(*fp, rtnl_dereference(f->next)); tcf_unbind_filter(tp, &f->res); + tcf_exts_get_net(&f->exts); call_rcu(&f->rcu, fw_delete_filter); ret = 0; break; @@ -302,6 +312,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(fnew->next, rtnl_dereference(pfp->next)); rcu_assign_pointer(*fp, fnew); tcf_unbind_filter(tp, &f->res); + tcf_exts_get_net(&f->exts); call_rcu(&f->rcu, fw_delete_filter); *arg = fnew; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 95dc997873e8..66d4e0099158 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -44,13 +44,19 @@ static int mall_init(struct tcf_proto *tp) return 0; } +static void __mall_destroy(struct cls_mall_head *head) +{ + tcf_exts_destroy(&head->exts); + tcf_exts_put_net(&head->exts); + kfree(head); +} + static void mall_destroy_work(struct work_struct *work) { struct cls_mall_head *head = container_of(work, struct cls_mall_head, work); rtnl_lock(); - tcf_exts_destroy(&head->exts); - kfree(head); + __mall_destroy(head); rtnl_unlock(); } @@ -116,7 +122,10 @@ static void mall_destroy(struct tcf_proto *tp) if (!tc_skip_hw(head->flags)) mall_destroy_hw_filter(tp, head, (unsigned long) head); - call_rcu(&head->rcu, mall_destroy_rcu); + if (tcf_exts_get_net(&head->exts)) + call_rcu(&head->rcu, mall_destroy_rcu); + else + __mall_destroy(head); } static void *mall_get(struct tcf_proto *tp, u32 handle) diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 4b14ccd8b8f2..ac9a5b8825b9 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -257,13 +257,19 @@ static int route4_init(struct tcf_proto *tp) return 0; } +static void __route4_delete_filter(struct route4_filter *f) +{ + tcf_exts_destroy(&f->exts); + tcf_exts_put_net(&f->exts); + kfree(f); +} + static void route4_delete_filter_work(struct work_struct *work) { struct route4_filter *f = container_of(work, struct route4_filter, work); rtnl_lock(); - tcf_exts_destroy(&f->exts); - kfree(f); + __route4_delete_filter(f); rtnl_unlock(); } @@ -297,7 +303,10 @@ static void route4_destroy(struct tcf_proto *tp) next = rtnl_dereference(f->next); RCU_INIT_POINTER(b->ht[h2], next); tcf_unbind_filter(tp, &f->res); - call_rcu(&f->rcu, route4_delete_filter); + if (tcf_exts_get_net(&f->exts)) + call_rcu(&f->rcu, route4_delete_filter); + else + __route4_delete_filter(f); } } RCU_INIT_POINTER(head->table[h1], NULL); @@ -338,6 +347,7 @@ static int route4_delete(struct tcf_proto *tp, void *arg, bool *last) /* Delete it */ tcf_unbind_filter(tp, &f->res); + tcf_exts_get_net(&f->exts); call_rcu(&f->rcu, route4_delete_filter); /* Strip RTNL protected tree */ @@ -541,6 +551,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, *arg = f; if (fold) { tcf_unbind_filter(tp, &fold->res); + tcf_exts_get_net(&fold->exts); call_rcu(&fold->rcu, route4_delete_filter); } return 0; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index bdbc541787f8..cf325625c99d 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -285,13 +285,19 @@ static int rsvp_init(struct tcf_proto *tp) return -ENOBUFS; } +static void __rsvp_delete_filter(struct rsvp_filter *f) +{ + tcf_exts_destroy(&f->exts); + tcf_exts_put_net(&f->exts); + kfree(f); +} + static void rsvp_delete_filter_work(struct work_struct *work) { struct rsvp_filter *f = container_of(work, struct rsvp_filter, work); rtnl_lock(); - tcf_exts_destroy(&f->exts); - kfree(f); + __rsvp_delete_filter(f); rtnl_unlock(); } @@ -310,7 +316,10 @@ static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) * grace period, since converted-to-rcu actions are relying on that * in cleanup() callback */ - call_rcu(&f->rcu, rsvp_delete_filter_rcu); + if (tcf_exts_get_net(&f->exts)) + call_rcu(&f->rcu, rsvp_delete_filter_rcu); + else + __rsvp_delete_filter(f); } static void rsvp_destroy(struct tcf_proto *tp) diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index d6abfa6757f2..67467ae24c97 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -142,13 +142,19 @@ static int tcindex_init(struct tcf_proto *tp) return 0; } +static void __tcindex_destroy_rexts(struct tcindex_filter_result *r) +{ + tcf_exts_destroy(&r->exts); + tcf_exts_put_net(&r->exts); +} + static void tcindex_destroy_rexts_work(struct work_struct *work) { struct tcindex_filter_result *r; r = container_of(work, struct tcindex_filter_result, work); rtnl_lock(); - tcf_exts_destroy(&r->exts); + __tcindex_destroy_rexts(r); rtnl_unlock(); } @@ -161,14 +167,20 @@ static void tcindex_destroy_rexts(struct rcu_head *head) tcf_queue_work(&r->work); } +static void __tcindex_destroy_fexts(struct tcindex_filter *f) +{ + tcf_exts_destroy(&f->result.exts); + tcf_exts_put_net(&f->result.exts); + kfree(f); +} + static void tcindex_destroy_fexts_work(struct work_struct *work) { struct tcindex_filter *f = container_of(work, struct tcindex_filter, work); rtnl_lock(); - tcf_exts_destroy(&f->result.exts); - kfree(f); + __tcindex_destroy_fexts(f); rtnl_unlock(); } @@ -213,10 +225,17 @@ found: * grace period, since converted-to-rcu actions are relying on that * in cleanup() callback */ - if (f) - call_rcu(&f->rcu, tcindex_destroy_fexts); - else - call_rcu(&r->rcu, tcindex_destroy_rexts); + if (f) { + if (tcf_exts_get_net(&f->result.exts)) + call_rcu(&f->rcu, tcindex_destroy_fexts); + else + __tcindex_destroy_fexts(f); + } else { + if (tcf_exts_get_net(&r->exts)) + call_rcu(&r->rcu, tcindex_destroy_rexts); + else + __tcindex_destroy_rexts(r); + } *last = false; return 0; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 2737b71854c9..ac152b4f4247 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -399,6 +399,7 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n, bool free_pf) { tcf_exts_destroy(&n->exts); + tcf_exts_put_net(&n->exts); if (n->ht_down) n->ht_down->refcnt--; #ifdef CONFIG_CLS_U32_PERF @@ -476,6 +477,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) RCU_INIT_POINTER(*kp, key->next); tcf_unbind_filter(tp, &key->res); + tcf_exts_get_net(&key->exts); call_rcu(&key->rcu, u32_delete_key_freepf_rcu); return 0; } @@ -590,7 +592,10 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) tcf_unbind_filter(tp, &n->res); u32_remove_hw_knode(tp, n->handle); idr_remove_ext(&ht->handle_idr, n->handle); - call_rcu(&n->rcu, u32_delete_key_freepf_rcu); + if (tcf_exts_get_net(&n->exts)) + call_rcu(&n->rcu, u32_delete_key_freepf_rcu); + else + u32_destroy_key(n->tp, n, true); } } } @@ -949,6 +954,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, u32_replace_knode(tp, tp_c, new); tcf_unbind_filter(tp, &n->res); + tcf_exts_get_net(&n->exts); call_rcu(&n->rcu, u32_delete_key_rcu); return 0; } diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index bdb533b7fb8c..7a72980c1509 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -212,7 +212,7 @@ static void cbs_disable_offload(struct net_device *dev, cbs.queue = q->queue; cbs.enable = 0; - err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs); + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs); if (err < 0) pr_warn("Couldn't disable CBS offload for queue %d\n", cbs.queue); @@ -236,7 +236,7 @@ static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q, cbs.idleslope = opt->idleslope; cbs.sendslope = opt->sendslope; - err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs); + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs); if (err < 0) return err; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 4d5ed45123f0..b85885a9d8a1 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -50,7 +50,8 @@ static void mqprio_destroy(struct Qdisc *sch) switch (priv->mode) { case TC_MQPRIO_MODE_DCB: case TC_MQPRIO_MODE_CHANNEL: - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, + dev->netdev_ops->ndo_setup_tc(dev, + TC_SETUP_QDISC_MQPRIO, &mqprio); break; default: @@ -265,7 +266,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) return -EINVAL; } err = dev->netdev_ops->ndo_setup_tc(dev, - TC_SETUP_MQPRIO, + TC_SETUP_QDISC_MQPRIO, &mqprio); if (err) return err; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index db0228a65e8c..dd70924cbcdf 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -77,8 +77,8 @@ struct netem_sched_data { struct qdisc_watchdog watchdog; - psched_tdiff_t latency; - psched_tdiff_t jitter; + s64 latency; + s64 jitter; u32 loss; u32 ecn; @@ -135,6 +135,13 @@ struct netem_sched_data { u32 a5; /* p23 used only in 4-states */ } clg; + struct tc_netem_slot slot_config; + struct slotstate { + u64 slot_next; + s32 packets_left; + s32 bytes_left; + } slot; + }; /* Time stamp put into socket buffer control block @@ -145,7 +152,7 @@ struct netem_sched_data { * we save skb->tstamp value in skb->cb[] before destroying it. */ struct netem_skb_cb { - psched_time_t time_to_send; + u64 time_to_send; }; static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) @@ -305,11 +312,11 @@ static bool loss_event(struct netem_sched_data *q) * std deviation sigma. Uses table lookup to approximate the desired * distribution, and a uniformly-distributed pseudo-random source. */ -static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma, - struct crndstate *state, - const struct disttable *dist) +static s64 tabledist(s64 mu, s32 sigma, + struct crndstate *state, + const struct disttable *dist) { - psched_tdiff_t x; + s64 x; long t; u32 rnd; @@ -320,7 +327,7 @@ static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma, /* default uniform distribution */ if (dist == NULL) - return (rnd % (2*sigma)) - sigma + mu; + return (rnd % (2 * sigma)) - sigma + mu; t = dist->table[rnd % dist->size]; x = (sigma % NETEM_DIST_SCALE) * t; @@ -332,10 +339,8 @@ static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma, return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; } -static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q) +static u64 packet_time_ns(u64 len, const struct netem_sched_data *q) { - u64 ticks; - len += q->packet_overhead; if (q->cell_size) { @@ -346,10 +351,7 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche len = cells * (q->cell_size + q->cell_overhead); } - ticks = (u64)len * NSEC_PER_SEC; - - do_div(ticks, q->rate); - return PSCHED_NS2TICKS(ticks); + return div64_u64(len * NSEC_PER_SEC, q->rate); } static void tfifo_reset(struct Qdisc *sch) @@ -369,7 +371,7 @@ static void tfifo_reset(struct Qdisc *sch) static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); - psched_time_t tnext = netem_skb_cb(nskb)->time_to_send; + u64 tnext = netem_skb_cb(nskb)->time_to_send; struct rb_node **p = &q->t_root.rb_node, *parent = NULL; while (*p) { @@ -515,13 +517,13 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (q->gap == 0 || /* not doing reordering */ q->counter < q->gap - 1 || /* inside last reordering gap */ q->reorder < get_crandom(&q->reorder_cor)) { - psched_time_t now; - psched_tdiff_t delay; + u64 now; + s64 delay; delay = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist); - now = psched_get_time(); + now = ktime_get_ns(); if (q->rate) { struct netem_skb_cb *last = NULL; @@ -547,11 +549,11 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, * from delay. */ delay -= last->time_to_send - now; - delay = max_t(psched_tdiff_t, 0, delay); + delay = max_t(s64, 0, delay); now = last->time_to_send; } - delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q); + delay += packet_time_ns(qdisc_pkt_len(skb), q); } cb->time_to_send = now + delay; @@ -562,7 +564,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, * Do re-ordering by putting one out of N packets at the front * of the queue. */ - cb->time_to_send = psched_get_time(); + cb->time_to_send = ktime_get_ns(); q->counter = 0; netem_enqueue_skb_head(&sch->q, skb); @@ -593,6 +595,20 @@ finish_segs: return NET_XMIT_SUCCESS; } +/* Delay the next round with a new future slot with a + * correct number of bytes and packets. + */ + +static void get_slot_next(struct netem_sched_data *q, u64 now) +{ + q->slot.slot_next = now + q->slot_config.min_delay + + (prandom_u32() * + (q->slot_config.max_delay - + q->slot_config.min_delay) >> 32); + q->slot.packets_left = q->slot_config.max_packets; + q->slot.bytes_left = q->slot_config.max_bytes; +} + static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); @@ -609,15 +625,18 @@ deliver: } p = rb_first(&q->t_root); if (p) { - psched_time_t time_to_send; + u64 time_to_send; + u64 now = ktime_get_ns(); skb = rb_to_skb(p); /* if more time remaining? */ time_to_send = netem_skb_cb(skb)->time_to_send; - if (time_to_send <= psched_get_time()) { - rb_erase(p, &q->t_root); + if (q->slot.slot_next && q->slot.slot_next < time_to_send) + get_slot_next(q, now); + if (time_to_send <= now && q->slot.slot_next <= now) { + rb_erase(p, &q->t_root); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); skb->next = NULL; @@ -636,6 +655,14 @@ deliver: skb->tstamp = 0; #endif + if (q->slot.slot_next) { + q->slot.packets_left--; + q->slot.bytes_left -= qdisc_pkt_len(skb); + if (q->slot.packets_left <= 0 || + q->slot.bytes_left <= 0) + get_slot_next(q, now); + } + if (q->qdisc) { unsigned int pkt_len = qdisc_pkt_len(skb); struct sk_buff *to_free = NULL; @@ -659,7 +686,10 @@ deliver: if (skb) goto deliver; } - qdisc_watchdog_schedule(&q->watchdog, time_to_send); + + qdisc_watchdog_schedule_ns(&q->watchdog, + max(time_to_send, + q->slot.slot_next)); } if (q->qdisc) { @@ -690,6 +720,7 @@ static void dist_free(struct disttable *d) * Distribution data is a variable size payload containing * signed 16 bit values. */ + static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) { struct netem_sched_data *q = qdisc_priv(sch); @@ -720,6 +751,23 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) return 0; } +static void get_slot(struct netem_sched_data *q, const struct nlattr *attr) +{ + const struct tc_netem_slot *c = nla_data(attr); + + q->slot_config = *c; + if (q->slot_config.max_packets == 0) + q->slot_config.max_packets = INT_MAX; + if (q->slot_config.max_bytes == 0) + q->slot_config.max_bytes = INT_MAX; + q->slot.packets_left = q->slot_config.max_packets; + q->slot.bytes_left = q->slot_config.max_bytes; + if (q->slot_config.min_delay | q->slot_config.max_delay) + q->slot.slot_next = ktime_get_ns(); + else + q->slot.slot_next = 0; +} + static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_corr *c = nla_data(attr); @@ -821,6 +869,9 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, [TCA_NETEM_ECN] = { .type = NLA_U32 }, [TCA_NETEM_RATE64] = { .type = NLA_U64 }, + [TCA_NETEM_LATENCY64] = { .type = NLA_S64 }, + [TCA_NETEM_JITTER64] = { .type = NLA_S64 }, + [TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) }, }; static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, @@ -888,8 +939,8 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) sch->limit = qopt->limit; - q->latency = qopt->latency; - q->jitter = qopt->jitter; + q->latency = PSCHED_TICKS2NS(qopt->latency); + q->jitter = PSCHED_TICKS2NS(qopt->jitter); q->limit = qopt->limit; q->gap = qopt->gap; q->counter = 0; @@ -918,9 +969,18 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) q->rate = max_t(u64, q->rate, nla_get_u64(tb[TCA_NETEM_RATE64])); + if (tb[TCA_NETEM_LATENCY64]) + q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]); + + if (tb[TCA_NETEM_JITTER64]) + q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]); + if (tb[TCA_NETEM_ECN]) q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]); + if (tb[TCA_NETEM_SLOT]) + get_slot(q, tb[TCA_NETEM_SLOT]); + return ret; } @@ -1010,9 +1070,12 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_netem_reorder reorder; struct tc_netem_corrupt corrupt; struct tc_netem_rate rate; + struct tc_netem_slot slot; - qopt.latency = q->latency; - qopt.jitter = q->jitter; + qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency), + UINT_MAX); + qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter), + UINT_MAX); qopt.limit = q->limit; qopt.loss = q->loss; qopt.gap = q->gap; @@ -1020,6 +1083,12 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) goto nla_put_failure; + if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency)) + goto nla_put_failure; + + if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter)) + goto nla_put_failure; + cor.delay_corr = q->delay_cor.rho; cor.loss_corr = q->loss_cor.rho; cor.dup_corr = q->dup_cor.rho; @@ -1056,6 +1125,16 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) if (dump_loss_model(q, skb) != 0) goto nla_put_failure; + if (q->slot_config.min_delay | q->slot_config.max_delay) { + slot = q->slot_config; + if (slot.max_packets == INT_MAX) + slot.max_packets = 0; + if (slot.max_bytes == INT_MAX) + slot.max_bytes = 0; + if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot)) + goto nla_put_failure; + } + return nla_nest_end(skb, nla); nla_put_failure: diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index fdfdb56aaae2..7f8ea9e297c3 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -19,6 +19,7 @@ #include <linux/kernel.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> +#include <net/pkt_cls.h> #include <net/inet_ecn.h> #include <net/red.h> @@ -148,11 +149,37 @@ static void red_reset(struct Qdisc *sch) red_restart(&q->vars); } +static int red_offload(struct Qdisc *sch, bool enable) +{ + struct red_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_red_qopt_offload opt = { + .handle = sch->handle, + .parent = sch->parent, + }; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return -EOPNOTSUPP; + + if (enable) { + opt.command = TC_RED_REPLACE; + opt.set.min = q->parms.qth_min >> q->parms.Wlog; + opt.set.max = q->parms.qth_max >> q->parms.Wlog; + opt.set.probability = q->parms.max_P; + opt.set.is_ecn = red_use_ecn(q); + } else { + opt.command = TC_RED_DESTROY; + } + + return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt); +} + static void red_destroy(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); del_timer_sync(&q->adapt_timer); + red_offload(sch, false); qdisc_destroy(q->qdisc); } @@ -219,6 +246,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) red_start_of_idle_period(&q->vars); sch_tree_unlock(sch); + red_offload(sch, true); return 0; } @@ -244,6 +272,35 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt) return red_change(sch, opt); } +static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_red_qopt_offload hw_stats = { + .command = TC_RED_STATS, + .handle = sch->handle, + .parent = sch->parent, + { + .stats.bstats = &sch->bstats, + .stats.qstats = &sch->qstats, + }, + }; + int err; + + opt->flags &= ~TC_RED_OFFLOADED; + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return 0; + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, + &hw_stats); + if (err == -EOPNOTSUPP) + return 0; + + if (!err) + opt->flags |= TC_RED_OFFLOADED; + + return err; +} + static int red_dump(struct Qdisc *sch, struct sk_buff *skb) { struct red_sched_data *q = qdisc_priv(sch); @@ -257,8 +314,13 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) .Plog = q->parms.Plog, .Scell_log = q->parms.Scell_log, }; + int err; sch->qstats.backlog = q->qdisc->qstats.backlog; + err = red_dump_offload(sch, &opt); + if (err) + goto nla_put_failure; + opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; @@ -275,6 +337,7 @@ nla_put_failure: static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct red_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); struct tc_red_xstats st = { .early = q->stats.prob_drop + q->stats.forced_drop, .pdrop = q->stats.pdrop, @@ -282,6 +345,26 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .marked = q->stats.prob_mark + q->stats.forced_mark, }; + if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc) { + struct red_stats hw_stats = {0}; + struct tc_red_qopt_offload hw_stats_request = { + .command = TC_RED_XSTATS, + .handle = sch->handle, + .parent = sch->parent, + { + .xstats = &hw_stats, + }, + }; + if (!dev->netdev_ops->ndo_setup_tc(dev, + TC_SETUP_QDISC_RED, + &hw_stats_request)) { + st.early += hw_stats.prob_drop + hw_stats.forced_drop; + st.pdrop += hw_stats.pdrop; + st.other += hw_stats.other; + st.marked += hw_stats.prob_mark + hw_stats.forced_mark; + } + } + return gnet_stats_copy_app(d, &st, sizeof(st)); } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index a6dfa86c0201..3b18085e3b10 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -807,9 +807,10 @@ static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname, addr->v6.sin6_flowinfo = 0; addr->v6.sin6_port = sh->source; addr->v6.sin6_addr = ipv6_hdr(skb)->saddr; - if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) { + if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) addr->v6.sin6_scope_id = sctp_v6_skb_iif(skb); - } + else + addr->v6.sin6_scope_id = 0; } *addr_len = sctp_v6_addr_to_user(sctp_sk(skb->sk), addr); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 514465b03829..9bf575f2e8ed 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3594,8 +3594,8 @@ struct sctp_chunk *sctp_make_strreset_req( __u16 stream_num, __be16 *stream_list, bool out, bool in) { + __u16 stream_len = stream_num * sizeof(__u16); struct sctp_strreset_outreq outreq; - __u16 stream_len = stream_num * 2; struct sctp_strreset_inreq inreq; struct sctp_chunk *retval; __u16 outlen, inlen; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b029757bea03..3204a9b29407 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -84,8 +84,8 @@ /* Forward declarations for internal helper functions. */ static int sctp_writeable(struct sock *sk); static void sctp_wfree(struct sk_buff *skb); -static int sctp_wait_for_sndbuf(struct sctp_association *, long *timeo_p, - size_t msg_len); +static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, + size_t msg_len, struct sock **orig_sk); static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p); static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p); static int sctp_wait_for_accept(struct sock *sk, long timeo); @@ -1970,9 +1970,16 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if (!sctp_wspace(asoc)) { - err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); - if (err) + /* sk can be changed by peel off when waiting for buf. */ + err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len, &sk); + if (err) { + if (err == -ESRCH) { + /* asoc is already dead. */ + new_asoc = NULL; + err = -EPIPE; + } goto out_free; + } } /* If an address is passed with the sendto/sendmsg call, it is used @@ -3133,9 +3140,9 @@ static int sctp_setsockopt_mappedv4(struct sock *sk, char __user *optval, unsign */ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int optlen) { + struct sctp_sock *sp = sctp_sk(sk); struct sctp_assoc_value params; struct sctp_association *asoc; - struct sctp_sock *sp = sctp_sk(sk); int val; if (optlen == sizeof(int)) { @@ -3151,26 +3158,35 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned if (copy_from_user(¶ms, optval, optlen)) return -EFAULT; val = params.assoc_value; - } else + } else { return -EINVAL; + } - if ((val != 0) && ((val < 8) || (val > SCTP_MAX_CHUNK_LEN))) - return -EINVAL; + if (val) { + int min_len, max_len; - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id && sctp_style(sk, UDP)) - return -EINVAL; + min_len = SCTP_DEFAULT_MINSEGMENT - sp->pf->af->net_header_len; + min_len -= sizeof(struct sctphdr) + + sizeof(struct sctp_data_chunk); + + max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk); + if (val < min_len || val > max_len) + return -EINVAL; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); if (asoc) { if (val == 0) { - val = asoc->pathmtu; - val -= sp->pf->af->net_header_len; + val = asoc->pathmtu - sp->pf->af->net_header_len; val -= sizeof(struct sctphdr) + - sizeof(struct sctp_data_chunk); + sizeof(struct sctp_data_chunk); } asoc->user_frag = val; asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu); } else { + if (params.assoc_id && sctp_style(sk, UDP)) + return -EINVAL; sp->user_frag = val; } @@ -5015,12 +5031,6 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) if (!asoc) return -EINVAL; - /* If there is a thread waiting on more sndbuf space for - * sending on this asoc, it cannot be peeled. - */ - if (waitqueue_active(&asoc->wait)) - return -EBUSY; - /* An association cannot be branched off from an already peeled-off * socket, nor is this supported for tcp style sockets. */ @@ -7989,7 +7999,7 @@ void sctp_sock_rfree(struct sk_buff *skb) /* Helper function to wait for space in the sndbuf. */ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len) + size_t msg_len, struct sock **orig_sk) { struct sock *sk = asoc->base.sk; int err = 0; @@ -8006,10 +8016,11 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, for (;;) { prepare_to_wait_exclusive(&asoc->wait, &wait, TASK_INTERRUPTIBLE); + if (asoc->base.dead) + goto do_dead; if (!*timeo_p) goto do_nonblock; - if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING || - asoc->base.dead) + if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING) goto do_error; if (signal_pending(current)) goto do_interrupted; @@ -8022,11 +8033,17 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, release_sock(sk); current_timeo = schedule_timeout(current_timeo); lock_sock(sk); + if (sk != asoc->base.sk) { + release_sock(sk); + sk = asoc->base.sk; + lock_sock(sk); + } *timeo_p = current_timeo; } out: + *orig_sk = sk; finish_wait(&asoc->wait, &wait); /* Release the association's refcnt. */ @@ -8034,6 +8051,10 @@ out: return err; +do_dead: + err = -ESRCH; + goto out; + do_error: err = -EPIPE; goto out; diff --git a/net/sctp/stream.c b/net/sctp/stream.c index b8c8cabb1a58..a11db21dc8a0 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -282,15 +282,31 @@ int sctp_send_reset_streams(struct sctp_association *asoc, str_nums = params->srs_number_streams; str_list = params->srs_stream_list; - if (out && str_nums) - for (i = 0; i < str_nums; i++) - if (str_list[i] >= stream->outcnt) - goto out; + if (str_nums) { + int param_len = 0; - if (in && str_nums) - for (i = 0; i < str_nums; i++) - if (str_list[i] >= stream->incnt) - goto out; + if (out) { + for (i = 0; i < str_nums; i++) + if (str_list[i] >= stream->outcnt) + goto out; + + param_len = str_nums * sizeof(__u16) + + sizeof(struct sctp_strreset_outreq); + } + + if (in) { + for (i = 0; i < str_nums; i++) + if (str_list[i] >= stream->incnt) + goto out; + + param_len += str_nums * sizeof(__u16) + + sizeof(struct sctp_strreset_inreq); + } + + if (param_len > SCTP_MAX_CHUNK_LEN - + sizeof(struct sctp_reconf_chunk)) + goto out; + } nstr_list = kcalloc(str_nums, sizeof(__be16), GFP_KERNEL); if (!nstr_list) { diff --git a/net/socket.c b/net/socket.c index c729625eb5d3..42d8e9c9ccd5 100644 --- a/net/socket.c +++ b/net/socket.c @@ -568,7 +568,6 @@ struct socket *sock_alloc(void) sock = SOCKET_I(inode); - kmemcheck_annotate_bitfield(sock, type); inode->i_ino = get_next_ino(); inode->i_mode = S_IFSOCK | S_IRWXUGO; inode->i_uid = current_fsuid(); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2ad827db2704..a801da812f86 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1491,7 +1491,6 @@ rpc_restart_call(struct rpc_task *task) } EXPORT_SYMBOL_GPL(rpc_restart_call); -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) const char *rpc_proc_name(const struct rpc_task *task) { @@ -1505,7 +1504,6 @@ const char } else return "no proc"; } -#endif /* * 0. Initial state @@ -1519,6 +1517,7 @@ call_start(struct rpc_task *task) struct rpc_clnt *clnt = task->tk_client; int idx = task->tk_msg.rpc_proc->p_statidx; + trace_rpc_request(task); dprintk("RPC: %5u call_start %s%d proc %s (%s)\n", task->tk_pid, clnt->cl_program->name, clnt->cl_vers, rpc_proc_name(task), @@ -1586,6 +1585,7 @@ call_reserveresult(struct rpc_task *task) switch (status) { case -ENOMEM: rpc_delay(task, HZ >> 2); + /* fall through */ case -EAGAIN: /* woken up; retry */ task->tk_action = call_retry_reserve; return; @@ -1647,10 +1647,13 @@ call_refreshresult(struct rpc_task *task) /* Use rate-limiting and a max number of retries if refresh * had status 0 but failed to update the cred. */ + /* fall through */ case -ETIMEDOUT: rpc_delay(task, 3*HZ); + /* fall through */ case -EAGAIN: status = -EACCES; + /* fall through */ case -EKEYEXPIRED: if (!task->tk_cred_retry) break; @@ -1911,6 +1914,7 @@ call_connect_status(struct rpc_task *task) task->tk_action = call_bind; return; } + /* fall through */ case -ECONNRESET: case -ECONNABORTED: case -ENETUNREACH: @@ -1924,6 +1928,7 @@ call_connect_status(struct rpc_task *task) break; /* retry with existing socket, after a delay */ rpc_delay(task, 3*HZ); + /* fall through */ case -EAGAIN: /* Check for timeouts before looping back to call_bind */ case -ETIMEDOUT: @@ -2025,6 +2030,7 @@ call_transmit_status(struct rpc_task *task) rpc_exit(task, task->tk_status); break; } + /* fall through */ case -ECONNRESET: case -ECONNABORTED: case -EADDRINUSE: @@ -2145,6 +2151,7 @@ call_status(struct rpc_task *task) * were a timeout. */ rpc_delay(task, 3*HZ); + /* fall through */ case -ETIMEDOUT: task->tk_action = call_timeout; break; @@ -2152,14 +2159,17 @@ call_status(struct rpc_task *task) case -ECONNRESET: case -ECONNABORTED: rpc_force_rebind(clnt); + /* fall through */ case -EADDRINUSE: rpc_delay(task, 3*HZ); + /* fall through */ case -EPIPE: case -ENOTCONN: task->tk_action = call_bind; break; case -ENOBUFS: rpc_delay(task, HZ>>2); + /* fall through */ case -EAGAIN: task->tk_action = call_transmit; break; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 61a504fb1ae2..7803f3b6aa53 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1410,8 +1410,8 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) return PTR_ERR(gssd_dentry); } - dprintk("RPC: sending pipefs MOUNT notification for net %p%s\n", - net, NET_NAME(net)); + dprintk("RPC: sending pipefs MOUNT notification for net %x%s\n", + net->ns.inum, NET_NAME(net)); mutex_lock(&sn->pipefs_sb_lock); sn->pipefs_sb = sb; err = blocking_notifier_call_chain(&rpc_pipefs_notifier_list, @@ -1462,8 +1462,8 @@ static void rpc_kill_sb(struct super_block *sb) goto out; } sn->pipefs_sb = NULL; - dprintk("RPC: sending pipefs UMOUNT notification for net %p%s\n", - net, NET_NAME(net)); + dprintk("RPC: sending pipefs UMOUNT notification for net %x%s\n", + net->ns.inum, NET_NAME(net)); blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_UMOUNT, sb); diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index ea0676f199c8..c526f8fb37c9 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -216,9 +216,9 @@ static void rpcb_set_local(struct net *net, struct rpc_clnt *clnt, smp_wmb(); sn->rpcb_users = 1; dprintk("RPC: created new rpcb local clients (rpcb_local_clnt: " - "%p, rpcb_local_clnt4: %p) for net %p%s\n", - sn->rpcb_local_clnt, sn->rpcb_local_clnt4, - net, (net == &init_net) ? " (init_net)" : ""); + "%p, rpcb_local_clnt4: %p) for net %x%s\n", + sn->rpcb_local_clnt, sn->rpcb_local_clnt4, + net->ns.inum, (net == &init_net) ? " (init_net)" : ""); } /* diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 5dea47eb31bb..b1b49edd7c4d 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -274,10 +274,9 @@ static inline void rpc_task_set_debuginfo(struct rpc_task *task) static void rpc_set_active(struct rpc_task *task) { - trace_rpc_task_begin(task->tk_client, task, NULL); - rpc_task_set_debuginfo(task); set_bit(RPC_TASK_ACTIVE, &task->tk_runstate); + trace_rpc_task_begin(task->tk_client, task, NULL); } /* diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index c73de181467a..56f9eff74150 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -65,10 +65,13 @@ err_proc: static __net_exit void sunrpc_exit_net(struct net *net) { + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + rpc_pipefs_exit_net(net); unix_gid_cache_destroy(net); ip_map_cache_destroy(net); rpc_proc_exit(net); + WARN_ON_ONCE(!list_empty(&sn->all_clients)); } static struct pernet_operations sunrpc_net_ops = { diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 33f4ae68426d..387cc4add6f6 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -50,7 +50,7 @@ EXPORT_SYMBOL_GPL(svc_pool_map); static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */ static int -param_set_pool_mode(const char *val, struct kernel_param *kp) +param_set_pool_mode(const char *val, const struct kernel_param *kp) { int *ip = (int *)kp->arg; struct svc_pool_map *m = &svc_pool_map; @@ -80,7 +80,7 @@ out: } static int -param_get_pool_mode(char *buf, struct kernel_param *kp) +param_get_pool_mode(char *buf, const struct kernel_param *kp) { int *ip = (int *)kp->arg; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 6160d17a31c4..333b9d697ae5 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1139,6 +1139,7 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) case -EAGAIN: xprt_add_backlog(xprt, task); dprintk("RPC: waiting for request slot\n"); + /* fall through */ default: task->tk_status = -EAGAIN; } diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 823a781ec89c..8b818bb3518a 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -43,7 +43,7 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, req = rpcrdma_create_req(r_xprt); if (IS_ERR(req)) return PTR_ERR(req); - req->rl_backchannel = true; + __set_bit(RPCRDMA_REQ_F_BACKCHANNEL, &req->rl_flags); rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, GFP_KERNEL); @@ -223,8 +223,8 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) *p++ = xdr_zero; *p = xdr_zero; - if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN, - &rqst->rq_snd_buf, rpcrdma_noch)) + if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN, + &rqst->rq_snd_buf, rpcrdma_noch)) return -EIO; return 0; } diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index fa759dd2b0f3..29fc84c7ff98 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -306,28 +306,9 @@ out_reset: } } -/* Use a slow, safe mechanism to invalidate all memory regions - * that were registered for "req". - */ -static void -fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - bool sync) -{ - struct rpcrdma_mw *mw; - - while (!list_empty(&req->rl_registered)) { - mw = rpcrdma_pop_mw(&req->rl_registered); - if (sync) - fmr_op_recover_mr(mw); - else - rpcrdma_defer_mr_recovery(mw); - } -} - const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_unmap_sync = fmr_op_unmap_sync, - .ro_unmap_safe = fmr_op_unmap_safe, .ro_recover_mr = fmr_op_recover_mr, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 35d7517ef0e6..773e66e10a15 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -420,7 +420,6 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ; - rpcrdma_set_signaled(&r_xprt->rx_ep, ®_wr->wr); rc = ib_post_send(ia->ri_id->qp, ®_wr->wr, &bad_wr); if (rc) goto out_senderr; @@ -508,12 +507,6 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) f->fr_cqe.done = frwr_wc_localinv_wake; reinit_completion(&f->fr_linv_done); - /* Initialize CQ count, since there is always a signaled - * WR being posted here. The new cqcount depends on how - * many SQEs are about to be consumed. - */ - rpcrdma_init_cqcount(&r_xprt->rx_ep, count); - /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us * unless ri_id->qp is a valid pointer. @@ -546,7 +539,6 @@ reset_mrs: /* Find and reset the MRs in the LOCAL_INV WRs that did not * get posted. */ - rpcrdma_init_cqcount(&r_xprt->rx_ep, -count); while (bad_wr) { f = container_of(bad_wr, struct rpcrdma_frmr, fr_invwr); @@ -559,28 +551,9 @@ reset_mrs: goto unmap; } -/* Use a slow, safe mechanism to invalidate all memory regions - * that were registered for "req". - */ -static void -frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - bool sync) -{ - struct rpcrdma_mw *mw; - - while (!list_empty(&req->rl_registered)) { - mw = rpcrdma_pop_mw(&req->rl_registered); - if (sync) - frwr_op_recover_mr(mw); - else - rpcrdma_defer_mr_recovery(mw); - } -} - const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, .ro_unmap_sync = frwr_op_unmap_sync, - .ro_unmap_safe = frwr_op_unmap_safe, .ro_recover_mr = frwr_op_recover_mr, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index f1889f4d4803..ed34dc0f144c 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -75,11 +76,11 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) /* Maximum Read list size */ maxsegs += 2; /* segment for head and tail buffers */ - size = maxsegs * sizeof(struct rpcrdma_read_chunk); + size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); /* Minimal Read chunk size */ size += sizeof(__be32); /* segment count */ - size += sizeof(struct rpcrdma_segment); + size += rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ dprintk("RPC: %s: max call header size = %u\n", @@ -102,7 +103,7 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) /* Maximum Write list size */ maxsegs += 2; /* segment for head and tail buffers */ size = sizeof(__be32); /* segment count */ - size += maxsegs * sizeof(struct rpcrdma_segment); + size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ dprintk("RPC: %s: max reply header size = %u\n", @@ -511,27 +512,60 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, return 0; } -/* Prepare the RPC-over-RDMA header SGE. +/** + * rpcrdma_unmap_sendctx - DMA-unmap Send buffers + * @sc: sendctx containing SGEs to unmap + * + */ +void +rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc) +{ + struct rpcrdma_ia *ia = &sc->sc_xprt->rx_ia; + struct ib_sge *sge; + unsigned int count; + + dprintk("RPC: %s: unmapping %u sges for sc=%p\n", + __func__, sc->sc_unmap_count, sc); + + /* The first two SGEs contain the transport header and + * the inline buffer. These are always left mapped so + * they can be cheaply re-used. + */ + sge = &sc->sc_sges[2]; + for (count = sc->sc_unmap_count; count; ++sge, --count) + ib_dma_unmap_page(ia->ri_device, + sge->addr, sge->length, DMA_TO_DEVICE); + + if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &sc->sc_req->rl_flags)) { + smp_mb__after_atomic(); + wake_up_bit(&sc->sc_req->rl_flags, RPCRDMA_REQ_F_TX_RESOURCES); + } +} + +/* Prepare an SGE for the RPC-over-RDMA transport header. */ static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, u32 len) { + struct rpcrdma_sendctx *sc = req->rl_sendctx; struct rpcrdma_regbuf *rb = req->rl_rdmabuf; - struct ib_sge *sge = &req->rl_send_sge[0]; + struct ib_sge *sge = sc->sc_sges; - if (unlikely(!rpcrdma_regbuf_is_mapped(rb))) { - if (!__rpcrdma_dma_map_regbuf(ia, rb)) - return false; - sge->addr = rdmab_addr(rb); - sge->lkey = rdmab_lkey(rb); - } + if (!rpcrdma_dma_map_regbuf(ia, rb)) + goto out_regbuf; + sge->addr = rdmab_addr(rb); sge->length = len; + sge->lkey = rdmab_lkey(rb); ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, DMA_TO_DEVICE); - req->rl_send_wr.num_sge++; + sc->sc_wr.num_sge++; return true; + +out_regbuf: + pr_err("rpcrdma: failed to DMA map a Send buffer\n"); + return false; } /* Prepare the Send SGEs. The head and tail iovec, and each entry @@ -541,10 +575,11 @@ static bool rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) { + struct rpcrdma_sendctx *sc = req->rl_sendctx; unsigned int sge_no, page_base, len, remaining; struct rpcrdma_regbuf *rb = req->rl_sendbuf; struct ib_device *device = ia->ri_device; - struct ib_sge *sge = req->rl_send_sge; + struct ib_sge *sge = sc->sc_sges; u32 lkey = ia->ri_pd->local_dma_lkey; struct page *page, **ppages; @@ -552,7 +587,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, * DMA-mapped. Sync the content that has changed. */ if (!rpcrdma_dma_map_regbuf(ia, rb)) - return false; + goto out_regbuf; sge_no = 1; sge[sge_no].addr = rdmab_addr(rb); sge[sge_no].length = xdr->head[0].iov_len; @@ -607,7 +642,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, sge[sge_no].length = len; sge[sge_no].lkey = lkey; - req->rl_mapped_sges++; + sc->sc_unmap_count++; ppages++; remaining -= len; page_base = 0; @@ -633,56 +668,61 @@ map_tail: goto out_mapping_err; sge[sge_no].length = len; sge[sge_no].lkey = lkey; - req->rl_mapped_sges++; + sc->sc_unmap_count++; } out: - req->rl_send_wr.num_sge = sge_no + 1; + sc->sc_wr.num_sge += sge_no; + if (sc->sc_unmap_count) + __set_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags); return true; +out_regbuf: + pr_err("rpcrdma: failed to DMA map a Send buffer\n"); + return false; + out_mapping_overflow: + rpcrdma_unmap_sendctx(sc); pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no); return false; out_mapping_err: + rpcrdma_unmap_sendctx(sc); pr_err("rpcrdma: Send mapping error\n"); return false; } -bool -rpcrdma_prepare_send_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, - u32 hdrlen, struct xdr_buf *xdr, - enum rpcrdma_chunktype rtype) +/** + * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR + * @r_xprt: controlling transport + * @req: context of RPC Call being marshalled + * @hdrlen: size of transport header, in bytes + * @xdr: xdr_buf containing RPC Call + * @rtype: chunk type being encoded + * + * Returns 0 on success; otherwise a negative errno is returned. + */ +int +rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, u32 hdrlen, + struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) { - req->rl_send_wr.num_sge = 0; - req->rl_mapped_sges = 0; - - if (!rpcrdma_prepare_hdr_sge(ia, req, hdrlen)) - goto out_map; + req->rl_sendctx = rpcrdma_sendctx_get_locked(&r_xprt->rx_buf); + if (!req->rl_sendctx) + return -ENOBUFS; + req->rl_sendctx->sc_wr.num_sge = 0; + req->rl_sendctx->sc_unmap_count = 0; + req->rl_sendctx->sc_req = req; + __clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags); + + if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen)) + return -EIO; if (rtype != rpcrdma_areadch) - if (!rpcrdma_prepare_msg_sges(ia, req, xdr, rtype)) - goto out_map; - - return true; - -out_map: - pr_err("rpcrdma: failed to DMA map a Send buffer\n"); - return false; -} - -void -rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req) -{ - struct ib_device *device = ia->ri_device; - struct ib_sge *sge; - int count; + if (!rpcrdma_prepare_msg_sges(&r_xprt->rx_ia, req, xdr, rtype)) + return -EIO; - sge = &req->rl_send_sge[2]; - for (count = req->rl_mapped_sges; count--; sge++) - ib_dma_unmap_page(device, sge->addr, sge->length, - DMA_TO_DEVICE); - req->rl_mapped_sges = 0; + return 0; } /** @@ -833,12 +873,10 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) transfertypes[rtype], transfertypes[wtype], xdr_stream_pos(xdr)); - if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, - xdr_stream_pos(xdr), - &rqst->rq_snd_buf, rtype)) { - ret = -EIO; + ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr), + &rqst->rq_snd_buf, rtype); + if (ret) goto out_err; - } return 0; out_err: @@ -970,14 +1008,13 @@ rpcrdma_mark_remote_invalidation(struct list_head *mws, * straightforward to check the RPC header's direction field. */ static bool -rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, - __be32 xid, __be32 proc) +rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) #if defined(CONFIG_SUNRPC_BACKCHANNEL) { struct xdr_stream *xdr = &rep->rr_stream; __be32 *p; - if (proc != rdma_msg) + if (rep->rr_proc != rdma_msg) return false; /* Peek at stream contents without advancing. */ @@ -992,7 +1029,7 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, return false; /* RPC header */ - if (*p++ != xid) + if (*p++ != rep->rr_xid) return false; if (*p != cpu_to_be32(RPC_CALL)) return false; @@ -1212,105 +1249,170 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, return -EREMOTEIO; } +/* Perform XID lookup, reconstruction of the RPC reply, and + * RPC completion while holding the transport lock to ensure + * the rep, rqst, and rq_task pointers remain stable. + */ +void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) +{ + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpc_rqst *rqst = rep->rr_rqst; + unsigned long cwnd; + int status; + + xprt->reestablish_timeout = 0; + + switch (rep->rr_proc) { + case rdma_msg: + status = rpcrdma_decode_msg(r_xprt, rep, rqst); + break; + case rdma_nomsg: + status = rpcrdma_decode_nomsg(r_xprt, rep); + break; + case rdma_error: + status = rpcrdma_decode_error(r_xprt, rep, rqst); + break; + default: + status = -EIO; + } + if (status < 0) + goto out_badheader; + +out: + spin_lock(&xprt->recv_lock); + cwnd = xprt->cwnd; + xprt->cwnd = r_xprt->rx_buf.rb_credits << RPC_CWNDSHIFT; + if (xprt->cwnd > cwnd) + xprt_release_rqst_cong(rqst->rq_task); + + xprt_complete_rqst(rqst->rq_task, status); + xprt_unpin_rqst(rqst); + spin_unlock(&xprt->recv_lock); + return; + +/* If the incoming reply terminated a pending RPC, the next + * RPC call will post a replacement receive buffer as it is + * being marshaled. + */ +out_badheader: + dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", + rqst->rq_task->tk_pid, __func__, be32_to_cpu(rep->rr_proc)); + r_xprt->rx_stats.bad_reply_count++; + status = -EIO; + goto out; +} + +void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + /* Invalidate and unmap the data payloads before waking + * the waiting application. This guarantees the memory + * regions are properly fenced from the server before the + * application accesses the data. It also ensures proper + * send flow control: waking the next RPC waits until this + * RPC has relinquished all its Send Queue entries. + */ + if (!list_empty(&req->rl_registered)) + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, + &req->rl_registered); + + /* Ensure that any DMA mapped pages associated with + * the Send of the RPC Call have been unmapped before + * allowing the RPC to complete. This protects argument + * memory not controlled by the RPC client from being + * re-used before we're done with it. + */ + if (test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) { + r_xprt->rx_stats.reply_waits_for_send++; + out_of_line_wait_on_bit(&req->rl_flags, + RPCRDMA_REQ_F_TX_RESOURCES, + bit_wait, + TASK_UNINTERRUPTIBLE); + } +} + +/* Reply handling runs in the poll worker thread. Anything that + * might wait is deferred to a separate workqueue. + */ +void rpcrdma_deferred_completion(struct work_struct *work) +{ + struct rpcrdma_rep *rep = + container_of(work, struct rpcrdma_rep, rr_work); + struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst); + + rpcrdma_mark_remote_invalidation(&req->rl_registered, rep); + rpcrdma_release_rqst(rep->rr_rxprt, req); + rpcrdma_complete_rqst(rep); +} + /* Process received RPC/RDMA messages. * * Errors must result in the RPC task either being awakened, or * allowed to timeout, to discover the errors at that time. */ -void -rpcrdma_reply_handler(struct work_struct *work) +void rpcrdma_reply_handler(struct rpcrdma_rep *rep) { - struct rpcrdma_rep *rep = - container_of(work, struct rpcrdma_rep, rr_work); struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct xdr_stream *xdr = &rep->rr_stream; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_req *req; struct rpc_rqst *rqst; - __be32 *p, xid, vers, proc; - unsigned long cwnd; - int status; + u32 credits; + __be32 *p; dprintk("RPC: %s: incoming rep %p\n", __func__, rep); if (rep->rr_hdrbuf.head[0].iov_len == 0) goto out_badstatus; - xdr_init_decode(xdr, &rep->rr_hdrbuf, + xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, rep->rr_hdrbuf.head[0].iov_base); /* Fixed transport header fields */ - p = xdr_inline_decode(xdr, 4 * sizeof(*p)); + p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p)); if (unlikely(!p)) goto out_shortreply; - xid = *p++; - vers = *p++; - p++; /* credits */ - proc = *p++; + rep->rr_xid = *p++; + rep->rr_vers = *p++; + credits = be32_to_cpu(*p++); + rep->rr_proc = *p++; + + if (rep->rr_vers != rpcrdma_version) + goto out_badversion; - if (rpcrdma_is_bcall(r_xprt, rep, xid, proc)) + if (rpcrdma_is_bcall(r_xprt, rep)) return; /* Match incoming rpcrdma_rep to an rpcrdma_req to * get context for handling any incoming chunks. */ spin_lock(&xprt->recv_lock); - rqst = xprt_lookup_rqst(xprt, xid); + rqst = xprt_lookup_rqst(xprt, rep->rr_xid); if (!rqst) goto out_norqst; xprt_pin_rqst(rqst); + + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > buf->rb_max_requests) + credits = buf->rb_max_requests; + buf->rb_credits = credits; + spin_unlock(&xprt->recv_lock); + req = rpcr_to_rdmar(rqst); req->rl_reply = rep; + rep->rr_rqst = rqst; + clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", - __func__, rep, req, be32_to_cpu(xid)); - - /* Invalidate and unmap the data payloads before waking the - * waiting application. This guarantees the memory regions - * are properly fenced from the server before the application - * accesses the data. It also ensures proper send flow control: - * waking the next RPC waits until this RPC has relinquished - * all its Send Queue entries. - */ - if (!list_empty(&req->rl_registered)) { - rpcrdma_mark_remote_invalidation(&req->rl_registered, rep); - r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, - &req->rl_registered); - } - - xprt->reestablish_timeout = 0; - if (vers != rpcrdma_version) - goto out_badversion; + __func__, rep, req, be32_to_cpu(rep->rr_xid)); - switch (proc) { - case rdma_msg: - status = rpcrdma_decode_msg(r_xprt, rep, rqst); - break; - case rdma_nomsg: - status = rpcrdma_decode_nomsg(r_xprt, rep); - break; - case rdma_error: - status = rpcrdma_decode_error(r_xprt, rep, rqst); - break; - default: - status = -EIO; - } - if (status < 0) - goto out_badheader; - -out: - spin_lock(&xprt->recv_lock); - cwnd = xprt->cwnd; - xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; - if (xprt->cwnd > cwnd) - xprt_release_rqst_cong(rqst->rq_task); - - xprt_complete_rqst(rqst->rq_task, status); - xprt_unpin_rqst(rqst); - spin_unlock(&xprt->recv_lock); - dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", - __func__, xprt, rqst, status); + if (list_empty(&req->rl_registered) && + !test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) + rpcrdma_complete_rqst(rep); + else + queue_work(rpcrdma_receive_wq, &rep->rr_work); return; out_badstatus: @@ -1321,37 +1423,22 @@ out_badstatus: } return; -/* If the incoming reply terminated a pending RPC, the next - * RPC call will post a replacement receive buffer as it is - * being marshaled. - */ out_badversion: dprintk("RPC: %s: invalid version %d\n", - __func__, be32_to_cpu(vers)); - status = -EIO; - r_xprt->rx_stats.bad_reply_count++; - goto out; - -out_badheader: - dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", - rqst->rq_task->tk_pid, __func__, be32_to_cpu(proc)); - r_xprt->rx_stats.bad_reply_count++; - status = -EIO; - goto out; + __func__, be32_to_cpu(rep->rr_vers)); + goto repost; -/* The req was still available, but by the time the recv_lock - * was acquired, the rqst and task had been released. Thus the RPC - * has already been terminated. +/* The RPC transaction has already been terminated, or the header + * is corrupt. */ out_norqst: spin_unlock(&xprt->recv_lock); dprintk("RPC: %s: no match for incoming xid 0x%08x\n", - __func__, be32_to_cpu(xid)); + __func__, be32_to_cpu(rep->rr_xid)); goto repost; out_shortreply: dprintk("RPC: %s: short/invalid reply\n", __func__); - goto repost; /* If no pending RPC transaction was matched, post a replacement * receive buffer before returning. diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index c84e2b644e13..646c24494ea7 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -678,16 +679,14 @@ xprt_rdma_free(struct rpc_task *task) struct rpc_rqst *rqst = task->tk_rqstp; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - if (req->rl_backchannel) + if (test_bit(RPCRDMA_REQ_F_BACKCHANNEL, &req->rl_flags)) return; dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); - if (!list_empty(&req->rl_registered)) - ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task)); - rpcrdma_unmap_sges(ia, req); + if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags)) + rpcrdma_release_rqst(r_xprt, req); rpcrdma_buffer_put(req); } @@ -728,7 +727,8 @@ xprt_rdma_send_request(struct rpc_task *task) /* On retransmit, remove any previously registered chunks */ if (unlikely(!list_empty(&req->rl_registered))) - r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, + &req->rl_registered); rc = rpcrdma_marshal_req(r_xprt, rqst); if (rc < 0) @@ -742,6 +742,7 @@ xprt_rdma_send_request(struct rpc_task *task) goto drop_connection; req->rl_connect_cookie = xprt->connect_cookie; + set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) goto drop_connection; @@ -789,11 +790,13 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) r_xprt->rx_stats.failed_marshal_count, r_xprt->rx_stats.bad_reply_count, r_xprt->rx_stats.nomsg_call_count); - seq_printf(seq, "%lu %lu %lu %lu\n", + seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n", r_xprt->rx_stats.mrs_recovered, r_xprt->rx_stats.mrs_orphaned, r_xprt->rx_stats.mrs_allocated, - r_xprt->rx_stats.local_inv_needed); + r_xprt->rx_stats.local_inv_needed, + r_xprt->rx_stats.empty_sendctx_q, + r_xprt->rx_stats.reply_waits_for_send); } static int diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 11a1fbf7e59e..710b3f77db82 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -49,9 +50,10 @@ #include <linux/interrupt.h> #include <linux/slab.h> -#include <linux/prefetch.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/svc_rdma.h> + +#include <asm-generic/barrier.h> #include <asm/bitops.h> #include <rdma/ib_cm.h> @@ -73,7 +75,7 @@ static void rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt); static void rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf); static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); -static struct workqueue_struct *rpcrdma_receive_wq __read_mostly; +struct workqueue_struct *rpcrdma_receive_wq __read_mostly; int rpcrdma_alloc_wq(void) @@ -126,30 +128,17 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) { + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_sendctx *sc = + container_of(cqe, struct rpcrdma_sendctx, sc_cqe); + /* WARNING: Only wr_cqe and status are reliable at this point */ if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) pr_err("rpcrdma: Send: %s (%u/0x%x)\n", ib_wc_status_msg(wc->status), wc->status, wc->vendor_err); -} - -/* Perform basic sanity checking to avoid using garbage - * to update the credit grant value. - */ -static void -rpcrdma_update_granted_credits(struct rpcrdma_rep *rep) -{ - struct rpcrdma_buffer *buffer = &rep->rr_rxprt->rx_buf; - __be32 *p = rep->rr_rdmabuf->rg_base; - u32 credits; - credits = be32_to_cpup(p + 2); - if (credits == 0) - credits = 1; /* don't deadlock */ - else if (credits > buffer->rb_max_requests) - credits = buffer->rb_max_requests; - - atomic_set(&buffer->rb_credits, credits); + rpcrdma_sendctx_put_locked(sc); } /** @@ -181,11 +170,8 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) rdmab_addr(rep->rr_rdmabuf), wc->byte_len, DMA_FROM_DEVICE); - if (wc->byte_len >= RPCRDMA_HDRLEN_ERR) - rpcrdma_update_granted_credits(rep); - out_schedule: - queue_work(rpcrdma_receive_wq, &rep->rr_work); + rpcrdma_reply_handler(rep); return; out_fail: @@ -295,7 +281,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DISCONNECTED: connstate = -ECONNABORTED; connected: - atomic_set(&xprt->rx_buf.rb_credits, 1); + xprt->rx_buf.rb_credits = 1; ep->rep_connected = connstate; rpcrdma_conn_func(ep); wake_up_all(&ep->rep_connect_wait); @@ -564,16 +550,15 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.cap.max_recv_sge); /* set trigger for requesting send completion */ - ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1; - if (ep->rep_cqinit <= 2) - ep->rep_cqinit = 0; /* always signal? */ - rpcrdma_init_cqcount(ep, 0); + ep->rep_send_batch = min_t(unsigned int, RPCRDMA_MAX_SEND_BATCH, + cdata->max_requests >> 2); + ep->rep_send_count = ep->rep_send_batch; init_waitqueue_head(&ep->rep_connect_wait); INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); sendcq = ib_alloc_cq(ia->ri_device, NULL, ep->rep_attr.cap.max_send_wr + 1, - 0, IB_POLL_SOFTIRQ); + 1, IB_POLL_WORKQUEUE); if (IS_ERR(sendcq)) { rc = PTR_ERR(sendcq); dprintk("RPC: %s: failed to create send CQ: %i\n", @@ -583,7 +568,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, recvcq = ib_alloc_cq(ia->ri_device, NULL, ep->rep_attr.cap.max_recv_wr + 1, - 0, IB_POLL_SOFTIRQ); + 0, IB_POLL_WORKQUEUE); if (IS_ERR(recvcq)) { rc = PTR_ERR(recvcq); dprintk("RPC: %s: failed to create recv CQ: %i\n", @@ -846,6 +831,168 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ib_drain_qp(ia->ri_id->qp); } +/* Fixed-size circular FIFO queue. This implementation is wait-free and + * lock-free. + * + * Consumer is the code path that posts Sends. This path dequeues a + * sendctx for use by a Send operation. Multiple consumer threads + * are serialized by the RPC transport lock, which allows only one + * ->send_request call at a time. + * + * Producer is the code path that handles Send completions. This path + * enqueues a sendctx that has been completed. Multiple producer + * threads are serialized by the ib_poll_cq() function. + */ + +/* rpcrdma_sendctxs_destroy() assumes caller has already quiesced + * queue activity, and ib_drain_qp has flushed all remaining Send + * requests. + */ +static void rpcrdma_sendctxs_destroy(struct rpcrdma_buffer *buf) +{ + unsigned long i; + + for (i = 0; i <= buf->rb_sc_last; i++) + kfree(buf->rb_sc_ctxs[i]); + kfree(buf->rb_sc_ctxs); +} + +static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ia *ia) +{ + struct rpcrdma_sendctx *sc; + + sc = kzalloc(sizeof(*sc) + + ia->ri_max_send_sges * sizeof(struct ib_sge), + GFP_KERNEL); + if (!sc) + return NULL; + + sc->sc_wr.wr_cqe = &sc->sc_cqe; + sc->sc_wr.sg_list = sc->sc_sges; + sc->sc_wr.opcode = IB_WR_SEND; + sc->sc_cqe.done = rpcrdma_wc_send; + return sc; +} + +static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_sendctx *sc; + unsigned long i; + + /* Maximum number of concurrent outstanding Send WRs. Capping + * the circular queue size stops Send Queue overflow by causing + * the ->send_request call to fail temporarily before too many + * Sends are posted. + */ + i = buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; + dprintk("RPC: %s: allocating %lu send_ctxs\n", __func__, i); + buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL); + if (!buf->rb_sc_ctxs) + return -ENOMEM; + + buf->rb_sc_last = i - 1; + for (i = 0; i <= buf->rb_sc_last; i++) { + sc = rpcrdma_sendctx_create(&r_xprt->rx_ia); + if (!sc) + goto out_destroy; + + sc->sc_xprt = r_xprt; + buf->rb_sc_ctxs[i] = sc; + } + + return 0; + +out_destroy: + rpcrdma_sendctxs_destroy(buf); + return -ENOMEM; +} + +/* The sendctx queue is not guaranteed to have a size that is a + * power of two, thus the helpers in circ_buf.h cannot be used. + * The other option is to use modulus (%), which can be expensive. + */ +static unsigned long rpcrdma_sendctx_next(struct rpcrdma_buffer *buf, + unsigned long item) +{ + return likely(item < buf->rb_sc_last) ? item + 1 : 0; +} + +/** + * rpcrdma_sendctx_get_locked - Acquire a send context + * @buf: transport buffers from which to acquire an unused context + * + * Returns pointer to a free send completion context; or NULL if + * the queue is empty. + * + * Usage: Called to acquire an SGE array before preparing a Send WR. + * + * The caller serializes calls to this function (per rpcrdma_buffer), + * and provides an effective memory barrier that flushes the new value + * of rb_sc_head. + */ +struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_xprt *r_xprt; + struct rpcrdma_sendctx *sc; + unsigned long next_head; + + next_head = rpcrdma_sendctx_next(buf, buf->rb_sc_head); + + if (next_head == READ_ONCE(buf->rb_sc_tail)) + goto out_emptyq; + + /* ORDER: item must be accessed _before_ head is updated */ + sc = buf->rb_sc_ctxs[next_head]; + + /* Releasing the lock in the caller acts as a memory + * barrier that flushes rb_sc_head. + */ + buf->rb_sc_head = next_head; + + return sc; + +out_emptyq: + /* The queue is "empty" if there have not been enough Send + * completions recently. This is a sign the Send Queue is + * backing up. Cause the caller to pause and try again. + */ + dprintk("RPC: %s: empty sendctx queue\n", __func__); + r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); + r_xprt->rx_stats.empty_sendctx_q++; + return NULL; +} + +/** + * rpcrdma_sendctx_put_locked - Release a send context + * @sc: send context to release + * + * Usage: Called from Send completion to return a sendctxt + * to the queue. + * + * The caller serializes calls to this function (per rpcrdma_buffer). + */ +void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) +{ + struct rpcrdma_buffer *buf = &sc->sc_xprt->rx_buf; + unsigned long next_tail; + + /* Unmap SGEs of previously completed by unsignaled + * Sends by walking up the queue until @sc is found. + */ + next_tail = buf->rb_sc_tail; + do { + next_tail = rpcrdma_sendctx_next(buf, next_tail); + + /* ORDER: item must be accessed _before_ tail is updated */ + rpcrdma_unmap_sendctx(buf->rb_sc_ctxs[next_tail]); + + } while (buf->rb_sc_ctxs[next_tail] != sc); + + /* Paired with READ_ONCE */ + smp_store_release(&buf->rb_sc_tail, next_tail); +} + static void rpcrdma_mr_recovery_worker(struct work_struct *work) { @@ -941,13 +1088,8 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) spin_lock(&buffer->rb_reqslock); list_add(&req->rl_all, &buffer->rb_allreqs); spin_unlock(&buffer->rb_reqslock); - req->rl_cqe.done = rpcrdma_wc_send; req->rl_buffer = &r_xprt->rx_buf; INIT_LIST_HEAD(&req->rl_registered); - req->rl_send_wr.next = NULL; - req->rl_send_wr.wr_cqe = &req->rl_cqe; - req->rl_send_wr.sg_list = req->rl_send_sge; - req->rl_send_wr.opcode = IB_WR_SEND; return req; } @@ -974,7 +1116,7 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) rep->rr_cqe.done = rpcrdma_wc_receive; rep->rr_rxprt = r_xprt; - INIT_WORK(&rep->rr_work, rpcrdma_reply_handler); + INIT_WORK(&rep->rr_work, rpcrdma_deferred_completion); rep->rr_recv_wr.next = NULL; rep->rr_recv_wr.wr_cqe = &rep->rr_cqe; rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; @@ -995,7 +1137,6 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_max_requests = r_xprt->rx_data.max_requests; buf->rb_bc_srv_max_requests = 0; - atomic_set(&buf->rb_credits, 1); spin_lock_init(&buf->rb_mwlock); spin_lock_init(&buf->rb_lock); spin_lock_init(&buf->rb_recovery_lock); @@ -1022,7 +1163,6 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) rc = PTR_ERR(req); goto out; } - req->rl_backchannel = false; list_add(&req->rl_list, &buf->rb_send_bufs); } @@ -1040,6 +1180,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) list_add(&rep->rr_list, &buf->rb_recv_bufs); } + rc = rpcrdma_sendctxs_create(r_xprt); + if (rc) + goto out; + return 0; out: rpcrdma_buffer_destroy(buf); @@ -1116,6 +1260,8 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) cancel_delayed_work_sync(&buf->rb_recovery_worker); cancel_delayed_work_sync(&buf->rb_refresh_worker); + rpcrdma_sendctxs_destroy(buf); + while (!list_empty(&buf->rb_recv_bufs)) { struct rpcrdma_rep *rep; @@ -1231,7 +1377,6 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) struct rpcrdma_buffer *buffers = req->rl_buffer; struct rpcrdma_rep *rep = req->rl_reply; - req->rl_send_wr.num_sge = 0; req->rl_reply = NULL; spin_lock(&buffers->rb_lock); @@ -1363,7 +1508,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_req *req) { - struct ib_send_wr *send_wr = &req->rl_send_wr; + struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr; struct ib_send_wr *send_wr_fail; int rc; @@ -1377,7 +1522,14 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, dprintk("RPC: %s: posting %d s/g entries\n", __func__, send_wr->num_sge); - rpcrdma_set_signaled(ep, send_wr); + if (!ep->rep_send_count || + test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) { + send_wr->send_flags |= IB_SEND_SIGNALED; + ep->rep_send_count = ep->rep_send_batch; + } else { + send_wr->send_flags &= ~IB_SEND_SIGNALED; + --ep->rep_send_count; + } rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail); if (rc) goto out_postsend_err; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index e26a97d2f922..51686d9eac5f 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -93,8 +94,8 @@ enum { */ struct rpcrdma_ep { - atomic_t rep_cqcount; - int rep_cqinit; + unsigned int rep_send_count; + unsigned int rep_send_batch; int rep_connected; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; @@ -104,25 +105,6 @@ struct rpcrdma_ep { struct delayed_work rep_connect_worker; }; -static inline void -rpcrdma_init_cqcount(struct rpcrdma_ep *ep, int count) -{ - atomic_set(&ep->rep_cqcount, ep->rep_cqinit - count); -} - -/* To update send queue accounting, provider must take a - * send completion every now and then. - */ -static inline void -rpcrdma_set_signaled(struct rpcrdma_ep *ep, struct ib_send_wr *send_wr) -{ - send_wr->send_flags = 0; - if (unlikely(atomic_sub_return(1, &ep->rep_cqcount) <= 0)) { - rpcrdma_init_cqcount(ep, 0); - send_wr->send_flags = IB_SEND_SIGNALED; - } -} - /* Pre-allocate extra Work Requests for handling backward receives * and sends. This is a fixed value because the Work Queues are * allocated when the forward channel is set up. @@ -164,12 +146,6 @@ rdmab_lkey(struct rpcrdma_regbuf *rb) return rb->rg_iov.lkey; } -static inline struct rpcrdma_msg * -rdmab_to_msg(struct rpcrdma_regbuf *rb) -{ - return (struct rpcrdma_msg *)rb->rg_base; -} - static inline struct ib_device * rdmab_device(struct rpcrdma_regbuf *rb) { @@ -202,22 +178,24 @@ enum { }; /* - * struct rpcrdma_rep -- this structure encapsulates state required to recv - * and complete a reply, asychronously. It needs several pieces of - * state: - * o recv buffer (posted to provider) - * o ib_sge (also donated to provider) - * o status of reply (length, success or not) - * o bookkeeping state to get run by reply handler (list, etc) + * struct rpcrdma_rep -- this structure encapsulates state required + * to receive and complete an RPC Reply, asychronously. It needs + * several pieces of state: * - * These are allocated during initialization, per-transport instance. + * o receive buffer and ib_sge (donated to provider) + * o status of receive (success or not, length, inv rkey) + * o bookkeeping state to get run by reply handler (XDR stream) * - * N of these are associated with a transport instance, and stored in - * struct rpcrdma_buffer. N is the max number of outstanding requests. + * These structures are allocated during transport initialization. + * N of these are associated with a transport instance, managed by + * struct rpcrdma_buffer. N is the max number of outstanding RPCs. */ struct rpcrdma_rep { struct ib_cqe rr_cqe; + __be32 rr_xid; + __be32 rr_vers; + __be32 rr_proc; int rr_wc_flags; u32 rr_inv_rkey; struct rpcrdma_regbuf *rr_rdmabuf; @@ -225,10 +203,34 @@ struct rpcrdma_rep { struct work_struct rr_work; struct xdr_buf rr_hdrbuf; struct xdr_stream rr_stream; + struct rpc_rqst *rr_rqst; struct list_head rr_list; struct ib_recv_wr rr_recv_wr; }; +/* struct rpcrdma_sendctx - DMA mapped SGEs to unmap after Send completes + */ +struct rpcrdma_req; +struct rpcrdma_xprt; +struct rpcrdma_sendctx { + struct ib_send_wr sc_wr; + struct ib_cqe sc_cqe; + struct rpcrdma_xprt *sc_xprt; + struct rpcrdma_req *sc_req; + unsigned int sc_unmap_count; + struct ib_sge sc_sges[]; +}; + +/* Limit the number of SGEs that can be unmapped during one + * Send completion. This caps the amount of work a single + * completion can do before returning to the provider. + * + * Setting this to zero disables Send completion batching. + */ +enum { + RPCRDMA_MAX_SEND_BATCH = 7, +}; + /* * struct rpcrdma_mw - external memory region metadata * @@ -340,26 +342,30 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_list; - unsigned int rl_mapped_sges; unsigned int rl_connect_cookie; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply; struct xdr_stream rl_stream; struct xdr_buf rl_hdrbuf; - struct ib_send_wr rl_send_wr; - struct ib_sge rl_send_sge[RPCRDMA_MAX_SEND_SGES]; + struct rpcrdma_sendctx *rl_sendctx; struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */ struct rpcrdma_regbuf *rl_sendbuf; /* rq_snd_buf */ struct rpcrdma_regbuf *rl_recvbuf; /* rq_rcv_buf */ - struct ib_cqe rl_cqe; struct list_head rl_all; - bool rl_backchannel; + unsigned long rl_flags; struct list_head rl_registered; /* registered segments */ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; +/* rl_flags */ +enum { + RPCRDMA_REQ_F_BACKCHANNEL = 0, + RPCRDMA_REQ_F_PENDING, + RPCRDMA_REQ_F_TX_RESOURCES, +}; + static inline void rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req) { @@ -399,12 +405,17 @@ struct rpcrdma_buffer { struct list_head rb_mws; struct list_head rb_all; + unsigned long rb_sc_head; + unsigned long rb_sc_tail; + unsigned long rb_sc_last; + struct rpcrdma_sendctx **rb_sc_ctxs; + spinlock_t rb_lock; /* protect buf lists */ int rb_send_count, rb_recv_count; struct list_head rb_send_bufs; struct list_head rb_recv_bufs; u32 rb_max_requests; - atomic_t rb_credits; /* most recent credit grant */ + u32 rb_credits; /* most recent credit grant */ u32 rb_bc_srv_max_requests; spinlock_t rb_reqslock; /* protect rb_allreqs */ @@ -453,10 +464,12 @@ struct rpcrdma_stats { unsigned long mrs_recovered; unsigned long mrs_orphaned; unsigned long mrs_allocated; + unsigned long empty_sendctx_q; /* accessed when receiving a reply */ unsigned long long total_rdma_reply; unsigned long long fixup_copy_count; + unsigned long reply_waits_for_send; unsigned long local_inv_needed; unsigned long nomsg_call_count; unsigned long bcall_count; @@ -473,8 +486,6 @@ struct rpcrdma_memreg_ops { struct rpcrdma_mw **); void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct list_head *); - void (*ro_unmap_safe)(struct rpcrdma_xprt *, - struct rpcrdma_req *, bool); void (*ro_recover_mr)(struct rpcrdma_mw *); int (*ro_open)(struct rpcrdma_ia *, struct rpcrdma_ep *, @@ -532,6 +543,8 @@ void rpcrdma_ia_close(struct rpcrdma_ia *); bool frwr_is_supported(struct rpcrdma_ia *); bool fmr_is_supported(struct rpcrdma_ia *); +extern struct workqueue_struct *rpcrdma_receive_wq; + /* * Endpoint calls - xprtrdma/verbs.c */ @@ -554,6 +567,8 @@ struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *); void rpcrdma_destroy_req(struct rpcrdma_req *); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); +struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); +void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *); void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *); @@ -610,12 +625,18 @@ enum rpcrdma_chunktype { rpcrdma_replych }; -bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *, - u32, struct xdr_buf *, enum rpcrdma_chunktype); -void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *); +int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, u32 hdrlen, + struct xdr_buf *xdr, + enum rpcrdma_chunktype rtype); +void rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc); int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); -void rpcrdma_reply_handler(struct work_struct *work); +void rpcrdma_complete_rqst(struct rpcrdma_rep *rep); +void rpcrdma_reply_handler(struct rpcrdma_rep *rep); +void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req); +void rpcrdma_deferred_completion(struct work_struct *work); static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) { diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 4dad5da388d6..9cc850c2719e 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -552,6 +552,7 @@ static int xs_local_send_request(struct rpc_task *task) default: dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); + /* fall through */ case -EPIPE: xs_close(xprt); status = -ENOTCONN; @@ -1611,6 +1612,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt->connect_cookie++; clear_bit(XPRT_CONNECTED, &xprt->state); xs_tcp_force_close(xprt); + /* fall through */ case TCP_CLOSING: /* * If the server closed down the connection, make sure that @@ -2368,6 +2370,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) switch (ret) { case 0: xs_set_srcport(transport, sock); + /* fall through */ case -EINPROGRESS: /* SYN_SENT! */ if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) @@ -2419,6 +2422,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) default: printk("%s: connect returned unhandled error %d\n", __func__, status); + /* fall through */ case -EADDRNOTAVAIL: /* We're probably in TIME_WAIT. Get rid of existing socket, * and retry diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 0531b41d1f2d..74b9d916a58b 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -345,6 +345,8 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj) return sizeof(struct switchdev_obj_port_vlan); case SWITCHDEV_OBJ_ID_PORT_MDB: return sizeof(struct switchdev_obj_port_mdb); + case SWITCHDEV_OBJ_ID_HOST_MDB: + return sizeof(struct switchdev_obj_port_mdb); default: BUG(); } diff --git a/net/tipc/link.c b/net/tipc/link.c index 870b9b8f877a..6bce0b1117bd 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -239,7 +239,8 @@ static int link_is_up(struct tipc_link *l) static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq); static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, - u16 rcvgap, int tolerance, int priority, + bool probe_reply, u16 rcvgap, + int tolerance, int priority, struct sk_buff_head *xmitq); static void link_print(struct tipc_link *l, const char *str); static int tipc_link_build_nack_msg(struct tipc_link *l, @@ -773,7 +774,7 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) } if (state || probe || setup) - tipc_link_build_proto_msg(l, mtyp, probe, 0, 0, 0, xmitq); + tipc_link_build_proto_msg(l, mtyp, probe, 0, 0, 0, 0, xmitq); return rc; } @@ -1174,7 +1175,7 @@ int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq) /* Unicast ACK */ l->rcv_unacked = 0; l->stats.sent_acks++; - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq); return 0; } @@ -1188,7 +1189,7 @@ void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq) if (l->state == LINK_ESTABLISHING) mtyp = ACTIVATE_MSG; - tipc_link_build_proto_msg(l, mtyp, 0, 0, 0, 0, xmitq); + tipc_link_build_proto_msg(l, mtyp, 0, 0, 0, 0, 0, xmitq); /* Inform peer that this endpoint is going down if applicable */ skb = skb_peek_tail(xmitq); @@ -1215,7 +1216,7 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, } if ((skb_queue_len(&l->deferdq) == 1) || !(def_cnt % TIPC_NACK_INTV)) - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq); return 0; } @@ -1289,7 +1290,8 @@ drop: } static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, - u16 rcvgap, int tolerance, int priority, + bool probe_reply, u16 rcvgap, + int tolerance, int priority, struct sk_buff_head *xmitq) { struct tipc_link *bcl = l->bc_rcvlink; @@ -1337,6 +1339,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, msg_set_seq_gap(hdr, rcvgap); msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl)); msg_set_probe(hdr, probe); + msg_set_is_keepalive(hdr, probe || probe_reply); tipc_mon_prep(l->net, data, &dlen, mstate, l->bearer_id); msg_set_size(hdr, INT_H_SIZE + dlen); skb_trim(skb, INT_H_SIZE + dlen); @@ -1442,6 +1445,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, u16 rcv_nxt = l->rcv_nxt; u16 dlen = msg_data_sz(hdr); int mtyp = msg_type(hdr); + bool reply = msg_probe(hdr); void *data; char *if_name; int rc = 0; @@ -1528,9 +1532,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, /* Send NACK if peer has sent pkts we haven't received yet */ if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l)) rcvgap = peers_snd_nxt - l->rcv_nxt; - if (rcvgap || (msg_probe(hdr))) - tipc_link_build_proto_msg(l, STATE_MSG, 0, rcvgap, - 0, 0, xmitq); + if (rcvgap || reply) + tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, + rcvgap, 0, 0, xmitq); tipc_link_release_pkts(l, ack); /* If NACK, retransmit will now start at right position */ @@ -2122,14 +2126,14 @@ void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, struct sk_buff_head *xmitq) { l->tolerance = tol; - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, tol, 0, xmitq); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, tol, 0, xmitq); } void tipc_link_set_prio(struct tipc_link *l, u32 prio, struct sk_buff_head *xmitq) { l->priority = prio; - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, prio, xmitq); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, prio, xmitq); } void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 1649d456e22d..b0d07b35909d 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -174,7 +174,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (fragid == LAST_FRAGMENT) { TIPC_SKB_CB(head)->validated = false; - if (unlikely(!tipc_msg_validate(head))) + if (unlikely(!tipc_msg_validate(&head))) goto err; *buf = head; TIPC_SKB_CB(head)->tail = NULL; @@ -201,11 +201,21 @@ err: * TIPC will ignore the excess, under the assumption that it is optional info * introduced by a later release of the protocol. */ -bool tipc_msg_validate(struct sk_buff *skb) +bool tipc_msg_validate(struct sk_buff **_skb) { - struct tipc_msg *msg; + struct sk_buff *skb = *_skb; + struct tipc_msg *hdr; int msz, hsz; + /* Ensure that flow control ratio condition is satisfied */ + if (unlikely(skb->truesize / buf_roundup_len(skb) > 4)) { + skb = skb_copy(skb, GFP_ATOMIC); + if (!skb) + return false; + kfree_skb(*_skb); + *_skb = skb; + } + if (unlikely(TIPC_SKB_CB(skb)->validated)) return true; if (unlikely(!pskb_may_pull(skb, MIN_H_SIZE))) @@ -217,11 +227,11 @@ bool tipc_msg_validate(struct sk_buff *skb) if (unlikely(!pskb_may_pull(skb, hsz))) return false; - msg = buf_msg(skb); - if (unlikely(msg_version(msg) != TIPC_VERSION)) + hdr = buf_msg(skb); + if (unlikely(msg_version(hdr) != TIPC_VERSION)) return false; - msz = msg_size(msg); + msz = msg_size(hdr); if (unlikely(msz < hsz)) return false; if (unlikely((msz - hsz) > TIPC_MAX_USER_MSG_SIZE)) @@ -411,7 +421,7 @@ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos) skb_pull(*iskb, offset); imsz = msg_size(buf_msg(*iskb)); skb_trim(*iskb, imsz); - if (unlikely(!tipc_msg_validate(*iskb))) + if (unlikely(!tipc_msg_validate(iskb))) goto none; *pos += align(imsz); return true; diff --git a/net/tipc/msg.h b/net/tipc/msg.h index cedf811317fb..3e4384c222f7 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -226,6 +226,16 @@ static inline void msg_set_dest_droppable(struct tipc_msg *m, u32 d) msg_set_bits(m, 0, 19, 1, d); } +static inline int msg_is_keepalive(struct tipc_msg *m) +{ + return msg_bits(m, 0, 19, 1); +} + +static inline void msg_set_is_keepalive(struct tipc_msg *m, u32 d) +{ + msg_set_bits(m, 0, 19, 1, d); +} + static inline int msg_src_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); @@ -916,7 +926,7 @@ static inline bool msg_is_reset(struct tipc_msg *hdr) } struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp); -bool tipc_msg_validate(struct sk_buff *skb); +bool tipc_msg_validate(struct sk_buff **_skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, struct sk_buff_head *xmitq); @@ -944,6 +954,11 @@ static inline u16 buf_seqno(struct sk_buff *skb) return msg_seqno(buf_msg(skb)); } +static inline int buf_roundup_len(struct sk_buff *skb) +{ + return (skb->len / 1024 + 1) * 1024; +} + /* tipc_skb_peek(): peek and reserve first buffer in list * @list: list to be peeked in * Returns pointer to first buffer in list, if any diff --git a/net/tipc/node.c b/net/tipc/node.c index 009a81631280..507017fe0f1b 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1539,7 +1539,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) __skb_queue_head_init(&xmitq); /* Ensure message is well-formed before touching the header */ - if (unlikely(!tipc_msg_validate(skb))) + if (unlikely(!tipc_msg_validate(&skb))) goto discard; hdr = buf_msg(skb); usr = msg_user(hdr); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 60aff60e30ad..e07ee3ae0023 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -45,8 +45,18 @@ MODULE_AUTHOR("Mellanox Technologies"); MODULE_DESCRIPTION("Transport Layer Security Support"); MODULE_LICENSE("Dual BSD/GPL"); -static struct proto tls_base_prot; -static struct proto tls_sw_prot; +enum { + TLS_BASE_TX, + TLS_SW_TX, + TLS_NUM_CONFIG, +}; + +static struct proto tls_prots[TLS_NUM_CONFIG]; + +static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx) +{ + sk->sk_prot = &tls_prots[ctx->tx_conf]; +} int wait_on_pending_writer(struct sock *sk, long *timeo) { @@ -216,6 +226,12 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) void (*sk_proto_close)(struct sock *sk, long timeout); lock_sock(sk); + sk_proto_close = ctx->sk_proto_close; + + if (ctx->tx_conf == TLS_BASE_TX) { + kfree(ctx); + goto skip_tx_cleanup; + } if (!tls_complete_pending_work(sk, ctx, 0, &timeo)) tls_handle_open_record(sk, 0); @@ -232,13 +248,14 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) sg++; } } - ctx->free_resources(sk); + kfree(ctx->rec_seq); kfree(ctx->iv); - sk_proto_close = ctx->sk_proto_close; - kfree(ctx); + if (ctx->tx_conf == TLS_SW_TX) + tls_sw_free_tx_resources(sk); +skip_tx_cleanup: release_sock(sk); sk_proto_close(sk, timeout); } @@ -338,46 +355,41 @@ static int tls_getsockopt(struct sock *sk, int level, int optname, static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, unsigned int optlen) { - struct tls_crypto_info *crypto_info, tmp_crypto_info; + struct tls_crypto_info *crypto_info; struct tls_context *ctx = tls_get_ctx(sk); - struct proto *prot = NULL; int rc = 0; + int tx_conf; if (!optval || (optlen < sizeof(*crypto_info))) { rc = -EINVAL; goto out; } - rc = copy_from_user(&tmp_crypto_info, optval, sizeof(*crypto_info)); + crypto_info = &ctx->crypto_send; + /* Currently we don't support set crypto info more than one time */ + if (TLS_CRYPTO_INFO_READY(crypto_info)) + goto out; + + rc = copy_from_user(crypto_info, optval, sizeof(*crypto_info)); if (rc) { rc = -EFAULT; goto out; } /* check version */ - if (tmp_crypto_info.version != TLS_1_2_VERSION) { + if (crypto_info->version != TLS_1_2_VERSION) { rc = -ENOTSUPP; - goto out; + goto err_crypto_info; } - /* get user crypto info */ - crypto_info = &ctx->crypto_send; - - /* Currently we don't support set crypto info more than one time */ - if (TLS_CRYPTO_INFO_READY(crypto_info)) - goto out; - - switch (tmp_crypto_info.cipher_type) { + switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: { if (optlen != sizeof(struct tls12_crypto_info_aes_gcm_128)) { rc = -EINVAL; goto out; } - rc = copy_from_user( - crypto_info, - optval, - sizeof(struct tls12_crypto_info_aes_gcm_128)); - + rc = copy_from_user(crypto_info + 1, optval + sizeof(*crypto_info), + optlen - sizeof(*crypto_info)); if (rc) { rc = -EFAULT; goto err_crypto_info; @@ -389,18 +401,16 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, goto out; } - ctx->sk_write_space = sk->sk_write_space; - sk->sk_write_space = tls_write_space; - - ctx->sk_proto_close = sk->sk_prot->close; - /* currently SW is default, we will have ethtool in future */ rc = tls_set_sw_offload(sk, ctx); - prot = &tls_sw_prot; + tx_conf = TLS_SW_TX; if (rc) goto err_crypto_info; - sk->sk_prot = prot; + ctx->tx_conf = tx_conf; + update_sk_prot(sk, ctx); + ctx->sk_write_space = sk->sk_write_space; + sk->sk_write_space = tls_write_space; goto out; err_crypto_info: @@ -453,7 +463,10 @@ static int tls_init(struct sock *sk) icsk->icsk_ulp_data = ctx; ctx->setsockopt = sk->sk_prot->setsockopt; ctx->getsockopt = sk->sk_prot->getsockopt; - sk->sk_prot = &tls_base_prot; + ctx->sk_proto_close = sk->sk_prot->close; + + ctx->tx_conf = TLS_BASE_TX; + update_sk_prot(sk, ctx); out: return rc; } @@ -464,16 +477,21 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .init = tls_init, }; +static void build_protos(struct proto *prot, struct proto *base) +{ + prot[TLS_BASE_TX] = *base; + prot[TLS_BASE_TX].setsockopt = tls_setsockopt; + prot[TLS_BASE_TX].getsockopt = tls_getsockopt; + prot[TLS_BASE_TX].close = tls_sk_proto_close; + + prot[TLS_SW_TX] = prot[TLS_BASE_TX]; + prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; + prot[TLS_SW_TX].sendpage = tls_sw_sendpage; +} + static int __init tls_register(void) { - tls_base_prot = tcp_prot; - tls_base_prot.setsockopt = tls_setsockopt; - tls_base_prot.getsockopt = tls_getsockopt; - - tls_sw_prot = tls_base_prot; - tls_sw_prot.sendmsg = tls_sw_sendmsg; - tls_sw_prot.sendpage = tls_sw_sendpage; - tls_sw_prot.close = tls_sk_proto_close; + build_protos(tls_prots, &tcp_prot); tcp_register_ulp(&tcp_tls_ulp_ops); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 7d80040a37b6..73d19210dd49 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -39,22 +39,6 @@ #include <net/tls.h> -static inline void tls_make_aad(int recv, - char *buf, - size_t size, - char *record_sequence, - int record_sequence_size, - unsigned char record_type) -{ - memcpy(buf, record_sequence, record_sequence_size); - - buf[8] = record_type; - buf[9] = TLS_1_2_VERSION_MAJOR; - buf[10] = TLS_1_2_VERSION_MINOR; - buf[11] = size >> 8; - buf[12] = size & 0xFF; -} - static void trim_sg(struct sock *sk, struct scatterlist *sg, int *sg_num_elem, unsigned int *sg_size, int target_size) { @@ -219,7 +203,7 @@ static int tls_do_encryption(struct tls_context *tls_ctx, struct aead_request *aead_req; int rc; - aead_req = kmalloc(req_size, flags); + aead_req = kzalloc(req_size, flags); if (!aead_req) return -ENOMEM; @@ -249,7 +233,7 @@ static int tls_push_record(struct sock *sk, int flags, sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1); sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1); - tls_make_aad(0, ctx->aad_space, ctx->sg_plaintext_size, + tls_make_aad(ctx->aad_space, ctx->sg_plaintext_size, tls_ctx->rec_seq, tls_ctx->rec_seq_size, record_type); @@ -639,7 +623,7 @@ sendpage_end: return ret; } -static void tls_sw_free_resources(struct sock *sk) +void tls_sw_free_tx_resources(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); @@ -650,6 +634,7 @@ static void tls_sw_free_resources(struct sock *sk) tls_free_both_sg(sk); kfree(ctx); + kfree(tls_ctx); } int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) @@ -679,7 +664,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) } ctx->priv_ctx = (struct tls_offload_context *)sw_ctx; - ctx->free_resources = tls_sw_free_resources; crypto_info = &ctx->crypto_send; switch (crypto_info->cipher_type) { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fce2cbe6a193..a0e1951227fa 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6291,7 +6291,7 @@ static int nl80211_send_regdom(struct sk_buff *msg, struct netlink_callback *cb, if (!hdr) return -1; - genl_dump_check_consistent(cb, hdr, &nl80211_fam); + genl_dump_check_consistent(cb, hdr); if (nl80211_put_regdom(regdom, msg)) goto nla_put_failure; @@ -7722,7 +7722,7 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, if (!hdr) return -1; - genl_dump_check_consistent(cb, hdr, &nl80211_fam); + genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NL80211_ATTR_GENERATION, rdev->bss_generation)) goto nla_put_failure; @@ -14274,7 +14274,7 @@ static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd, struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct sk_buff *msg; void *hdr; - u32 nlportid = ACCESS_ONCE(wdev->ap_unexpected_nlportid); + u32 nlportid = READ_ONCE(wdev->ap_unexpected_nlportid); if (!nlportid) return false; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 82d20ee34581..347ab31574d5 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -266,8 +266,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto lock; } - daddr = (xfrm_address_t *)(skb_network_header(skb) + - XFRM_SPI_SKB_CB(skb)->daddroff); family = XFRM_SPI_SKB_CB(skb)->family; /* if tunnel is present override skb->mark value with tunnel i_key */ @@ -294,6 +292,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } + daddr = (xfrm_address_t *)(skb_network_header(skb) + + XFRM_SPI_SKB_CB(skb)->daddroff); do { if (skb->sp->len == XFRM_MAX_DEPTH) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f02b1743b239..9542975eb2f9 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1305,6 +1305,7 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) newp->xfrm_nr = old->xfrm_nr; newp->index = old->index; newp->type = old->type; + newp->family = old->family; memcpy(newp->xfrm_vec, old->xfrm_vec, newp->xfrm_nr*sizeof(struct xfrm_tmpl)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); @@ -1786,19 +1787,23 @@ void xfrm_policy_cache_flush(void) put_online_cpus(); } -static bool xfrm_pol_dead(struct xfrm_dst *xdst) +static bool xfrm_xdst_can_reuse(struct xfrm_dst *xdst, + struct xfrm_state * const xfrm[], + int num) { - unsigned int num_pols = xdst->num_pols; - unsigned int pol_dead = 0, i; + const struct dst_entry *dst = &xdst->u.dst; + int i; - for (i = 0; i < num_pols; i++) - pol_dead |= xdst->pols[i]->walk.dead; + if (xdst->num_xfrms != num) + return false; - /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */ - if (pol_dead) - xdst->u.dst.obsolete = DST_OBSOLETE_DEAD; + for (i = 0; i < num; i++) { + if (!dst || dst->xfrm != xfrm[i]) + return false; + dst = dst->child; + } - return pol_dead; + return xfrm_bundle_ok(xdst); } static struct xfrm_dst * @@ -1812,26 +1817,28 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, struct dst_entry *dst; int err; + /* Try to instantiate a bundle */ + err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family); + if (err <= 0) { + if (err != 0 && err != -EAGAIN) + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); + return ERR_PTR(err); + } + xdst = this_cpu_read(xfrm_last_dst); if (xdst && xdst->u.dst.dev == dst_orig->dev && xdst->num_pols == num_pols && - !xfrm_pol_dead(xdst) && memcmp(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols) == 0 && - xfrm_bundle_ok(xdst)) { + xfrm_xdst_can_reuse(xdst, xfrm, err)) { dst_hold(&xdst->u.dst); + while (err > 0) + xfrm_state_put(xfrm[--err]); return xdst; } old = xdst; - /* Try to instantiate a bundle */ - err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family); - if (err <= 0) { - if (err != 0 && err != -EAGAIN) - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); - return ERR_PTR(err); - } dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig); if (IS_ERR(dst)) { |