Diffstat (limited to 'net/sched')
-rw-r--r--  net/sched/Kconfig           | 186
-rw-r--r--  net/sched/Makefile          | 2
-rw-r--r--  net/sched/act_api.c         | 101
-rw-r--r--  net/sched/act_bpf.c         | 7
-rw-r--r--  net/sched/act_connmark.c    | 6
-rw-r--r--  net/sched/act_csum.c        | 16
-rw-r--r--  net/sched/act_ct.c          | 41
-rw-r--r--  net/sched/act_ctinfo.c      | 21
-rw-r--r--  net/sched/act_gact.c        | 23
-rw-r--r--  net/sched/act_ife.c         | 16
-rw-r--r--  net/sched/act_ipt.c         | 23
-rw-r--r--  net/sched/act_mirred.c      | 74
-rw-r--r--  net/sched/act_mpls.c        | 25
-rw-r--r--  net/sched/act_nat.c         | 10
-rw-r--r--  net/sched/act_pedit.c       | 19
-rw-r--r--  net/sched/act_police.c      | 49
-rw-r--r--  net/sched/act_sample.c      | 40
-rw-r--r--  net/sched/act_simple.c      | 9
-rw-r--r--  net/sched/act_skbedit.c     | 10
-rw-r--r--  net/sched/act_skbmod.c      | 6
-rw-r--r--  net/sched/act_tunnel_key.c  | 222
-rw-r--r--  net/sched/act_vlan.c        | 26
-rw-r--r--  net/sched/cls_api.c         | 847
-rw-r--r--  net/sched/cls_basic.c       | 11
-rw-r--r--  net/sched/cls_bpf.c         | 55
-rw-r--r--  net/sched/cls_flower.c      | 522
-rw-r--r--  net/sched/cls_fw.c          | 11
-rw-r--r--  net/sched/cls_matchall.c    | 45
-rw-r--r--  net/sched/cls_route.c       | 11
-rw-r--r--  net/sched/cls_rsvp.h        | 17
-rw-r--r--  net/sched/cls_tcindex.c     | 54
-rw-r--r--  net/sched/cls_u32.c         | 40
-rw-r--r--  net/sched/em_meta.c         | 8
-rw-r--r--  net/sched/ematch.c          | 5
-rw-r--r--  net/sched/sch_api.c         | 45
-rw-r--r--  net/sched/sch_cake.c        | 66
-rw-r--r--  net/sched/sch_cbq.c         | 43
-rw-r--r--  net/sched/sch_cbs.c         | 51
-rw-r--r--  net/sched/sch_choke.c       | 2
-rw-r--r--  net/sched/sch_dsmark.c      | 2
-rw-r--r--  net/sched/sch_etf.c         | 2
-rw-r--r--  net/sched/sch_ets.c         | 828
-rw-r--r--  net/sched/sch_fq.c          | 26
-rw-r--r--  net/sched/sch_fq_codel.c    | 15
-rw-r--r--  net/sched/sch_fq_pie.c      | 562
-rw-r--r--  net/sched/sch_generic.c     | 70
-rw-r--r--  net/sched/sch_hhf.c         | 10
-rw-r--r--  net/sched/sch_htb.c         | 4
-rw-r--r--  net/sched/sch_mq.c          | 4
-rw-r--r--  net/sched/sch_mqprio.c      | 7
-rw-r--r--  net/sched/sch_multiq.c      | 25
-rw-r--r--  net/sched/sch_netem.c       | 15
-rw-r--r--  net/sched/sch_pie.c         | 315
-rw-r--r--  net/sched/sch_prio.c        | 12
-rw-r--r--  net/sched/sch_sfb.c         | 20
-rw-r--r--  net/sched/sch_sfq.c         | 14
-rw-r--r--  net/sched/sch_taprio.c      | 552
-rw-r--r--  net/sched/sch_tbf.c         | 60
58 files changed, 4085 insertions, 1223 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index afd2ba157a13..edde0e519438 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -324,7 +324,7 @@ config NET_SCH_CAKE
tristate "Common Applications Kept Enhanced (CAKE)"
help
Say Y here if you want to use the Common Applications Kept Enhanced
- (CAKE) queue management algorithm.
+ (CAKE) queue management algorithm.
To compile this driver as a module, choose M here: the module
will be called sch_cake.
@@ -366,6 +366,19 @@ config NET_SCH_PIE
If unsure, say N.
+config NET_SCH_FQ_PIE
+ depends on NET_SCH_PIE
+ tristate "Flow Queue Proportional Integral controller Enhanced (FQ-PIE)"
+ help
+ Say Y here if you want to use the Flow Queue Proportional Integral
+ controller Enhanced (FQ-PIE) packet scheduling algorithm.
+ For more information, please see https://tools.ietf.org/html/rfc8033
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_fq_pie.
+
+ If unsure, say N.
+
config NET_SCH_INGRESS
tristate "Ingress/classifier-action Qdisc"
depends on NET_CLS_ACT
@@ -409,6 +422,23 @@ config NET_SCH_PLUG
To compile this code as a module, choose M here: the
module will be called sch_plug.
+config NET_SCH_ETS
+ tristate "Enhanced transmission selection scheduler (ETS)"
+ help
+ The Enhanced Transmission Selection scheduler is a classful
+ queuing discipline that merges functionality of PRIO and DRR
+ qdiscs in one scheduler. ETS makes it easy to configure a set of
+ strict and bandwidth-sharing bands to implement the transmission
+ selection described in 802.1Qaz.
+
+ Say Y here if you want to use the ETS packet scheduling
+ algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_ets.
+
+ If unsure, say N.
+
menuconfig NET_SCH_DEFAULT
bool "Allow override default queue discipline"
---help---
@@ -730,8 +760,8 @@ config NET_CLS_ACT
config NET_ACT_POLICE
tristate "Traffic Policing"
- depends on NET_CLS_ACT
- ---help---
+ depends on NET_CLS_ACT
+ ---help---
Say Y here if you want to do traffic policing, i.e. strict
bandwidth limiting. This action replaces the existing policing
module.
@@ -740,9 +770,9 @@ config NET_ACT_POLICE
module will be called act_police.
config NET_ACT_GACT
- tristate "Generic actions"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Generic actions"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to take generic actions such as dropping and
accepting packets.
@@ -750,15 +780,15 @@ config NET_ACT_GACT
module will be called act_gact.
config GACT_PROB
- bool "Probability support"
- depends on NET_ACT_GACT
- ---help---
+ bool "Probability support"
+ depends on NET_ACT_GACT
+ ---help---
Say Y here to use the generic action randomly or deterministically.
config NET_ACT_MIRRED
- tristate "Redirecting and Mirroring"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Redirecting and Mirroring"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to allow packets to be mirrored or redirected to
other devices.
@@ -766,10 +796,10 @@ config NET_ACT_MIRRED
module will be called act_mirred.
config NET_ACT_SAMPLE
- tristate "Traffic Sampling"
- depends on NET_CLS_ACT
- select PSAMPLE
- ---help---
+ tristate "Traffic Sampling"
+ depends on NET_CLS_ACT
+ select PSAMPLE
+ ---help---
Say Y here to allow packet sampling tc action. The packet sample
action consists of statistically choosing packets and sampling
them using the psample module.
@@ -778,9 +808,9 @@ config NET_ACT_SAMPLE
module will be called act_sample.
config NET_ACT_IPT
- tristate "IPtables targets"
- depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
- ---help---
+ tristate "IPtables targets"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ ---help---
Say Y here to be able to invoke iptables targets after successful
classification.
@@ -788,9 +818,9 @@ config NET_ACT_IPT
module will be called act_ipt.
config NET_ACT_NAT
- tristate "Stateless NAT"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Stateless NAT"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to do stateless NAT on IPv4 packets. You should use
netfilter for NAT unless you know what you are doing.
@@ -798,18 +828,18 @@ config NET_ACT_NAT
module will be called act_nat.
config NET_ACT_PEDIT
- tristate "Packet Editing"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Packet Editing"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here if you want to mangle the content of packets.
To compile this code as a module, choose M here: the
module will be called act_pedit.
config NET_ACT_SIMP
- tristate "Simple Example (Debug)"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Simple Example (Debug)"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to add a simple action for demonstration purposes.
It is meant as an example and for debugging purposes. It will
print a configured policy string followed by the packet count
@@ -821,9 +851,9 @@ config NET_ACT_SIMP
module will be called act_simple.
config NET_ACT_SKBEDIT
- tristate "SKB Editing"
- depends on NET_CLS_ACT
- ---help---
+ tristate "SKB Editing"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to change skb priority or queue_mapping settings.
If unsure, say N.
@@ -832,10 +862,10 @@ config NET_ACT_SKBEDIT
module will be called act_skbedit.
config NET_ACT_CSUM
- tristate "Checksum Updating"
- depends on NET_CLS_ACT && INET
- select LIBCRC32C
- ---help---
+ tristate "Checksum Updating"
+ depends on NET_CLS_ACT && INET
+ select LIBCRC32C
+ ---help---
Say Y here to update some common checksum after some direct
packet alterations.
@@ -854,9 +884,9 @@ config NET_ACT_MPLS
module will be called act_mpls.
config NET_ACT_VLAN
- tristate "Vlan manipulation"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Vlan manipulation"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to push or pop vlan headers.
If unsure, say N.
@@ -865,9 +895,9 @@ config NET_ACT_VLAN
module will be called act_vlan.
config NET_ACT_BPF
- tristate "BPF based action"
- depends on NET_CLS_ACT
- ---help---
+ tristate "BPF based action"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to execute BPF code on packets. The BPF code will decide
if the packet should be dropped or not.
@@ -877,10 +907,10 @@ config NET_ACT_BPF
module will be called act_bpf.
config NET_ACT_CONNMARK
- tristate "Netfilter Connection Mark Retriever"
- depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
- depends on NF_CONNTRACK && NF_CONNTRACK_MARK
- ---help---
+ tristate "Netfilter Connection Mark Retriever"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+ ---help---
Say Y here to allow retrieving of conn mark
If unsure, say N.
@@ -889,10 +919,10 @@ config NET_ACT_CONNMARK
module will be called act_connmark.
config NET_ACT_CTINFO
- tristate "Netfilter Connection Mark Actions"
- depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
- depends on NF_CONNTRACK && NF_CONNTRACK_MARK
- help
+ tristate "Netfilter Connection Mark Actions"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+ help
Say Y here to allow transfer of a connmark stored information.
Current actions transfer connmark stored DSCP into
ipv4/v6 diffserv and/or to transfer connmark to packet
@@ -906,21 +936,21 @@ config NET_ACT_CTINFO
module will be called act_ctinfo.
config NET_ACT_SKBMOD
- tristate "skb data modification action"
- depends on NET_CLS_ACT
- ---help---
- Say Y here to allow modification of skb data
+ tristate "skb data modification action"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to allow modification of skb data
- If unsure, say N.
+ If unsure, say N.
- To compile this code as a module, choose M here: the
- module will be called act_skbmod.
+ To compile this code as a module, choose M here: the
+ module will be called act_skbmod.
config NET_ACT_IFE
- tristate "Inter-FE action based on IETF ForCES InterFE LFB"
- depends on NET_CLS_ACT
- select NET_IFE
- ---help---
+ tristate "Inter-FE action based on IETF ForCES InterFE LFB"
+ depends on NET_CLS_ACT
+ select NET_IFE
+ ---help---
Say Y here to allow for sourcing and terminating metadata
For details refer to netdev01 paper:
"Distributing Linux Traffic Control Classifier-Action Subsystem"
@@ -930,9 +960,9 @@ config NET_ACT_IFE
module will be called act_ife.
config NET_ACT_TUNNEL_KEY
- tristate "IP tunnel metadata manipulation"
- depends on NET_CLS_ACT
- ---help---
+ tristate "IP tunnel metadata manipulation"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to set/release ip tunnel metadata.
If unsure, say N.
@@ -941,9 +971,9 @@ config NET_ACT_TUNNEL_KEY
module will be called act_tunnel_key.
config NET_ACT_CT
- tristate "connection tracking tc action"
- depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
- help
+ tristate "connection tracking tc action"
+ depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
+ help
Say Y here to allow sending the packets to conntrack module.
If unsure, say N.
@@ -952,16 +982,28 @@ config NET_ACT_CT
module will be called act_ct.
config NET_IFE_SKBMARK
- tristate "Support to encoding decoding skb mark on IFE action"
- depends on NET_ACT_IFE
+ tristate "Support to encoding decoding skb mark on IFE action"
+ depends on NET_ACT_IFE
config NET_IFE_SKBPRIO
- tristate "Support to encoding decoding skb prio on IFE action"
- depends on NET_ACT_IFE
+ tristate "Support to encoding decoding skb prio on IFE action"
+ depends on NET_ACT_IFE
config NET_IFE_SKBTCINDEX
- tristate "Support to encoding decoding skb tcindex on IFE action"
- depends on NET_ACT_IFE
+ tristate "Support to encoding decoding skb tcindex on IFE action"
+ depends on NET_ACT_IFE
+
+config NET_TC_SKB_EXT
+ bool "TC recirculation support"
+ depends on NET_CLS_ACT
+ select SKB_EXTENSIONS
+
+ help
+ Say Y here to allow tc chain misses to continue in OvS datapath in
+ the correct recirc_id, and hardware chain misses to continue in
+ the correct chain in tc software datapath.
+
+ Say N here if you won't be using tc<->ovs offload or tc chains offload.
endif # NET_SCHED
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 415d1e1f237e..31c367a6cd09 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
+obj-$(CONFIG_NET_SCH_ETS) += sch_ets.o
obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o
obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
@@ -58,6 +59,7 @@ obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o
obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o
obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o
obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o
+obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o
obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o
obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o
obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 339712296164..90a31b15585f 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -88,7 +88,7 @@ struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action,
struct tcf_chain *goto_chain)
{
a->tcfa_action = action;
- rcu_swap_protected(a->goto_chain, goto_chain, 1);
+ goto_chain = rcu_replace_pointer(a->goto_chain, goto_chain, 1);
return goto_chain;
}
EXPORT_SYMBOL(tcf_action_set_ctrlact);
@@ -188,6 +188,8 @@ static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
+ nla_total_size(0) /* TCA_ACT_STATS nested */
/* TCA_STATS_BASIC */
+ nla_total_size_64bit(sizeof(struct gnet_stats_basic))
+ /* TCA_STATS_PKT64 */
+ + nla_total_size_64bit(sizeof(u64))
/* TCA_STATS_QUEUE */
+ nla_total_size_64bit(sizeof(struct gnet_stats_queue))
+ nla_total_size(0) /* TCA_OPTIONS nested */
@@ -399,7 +401,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index)
int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
struct tc_action **a, const struct tc_action_ops *ops,
- int bind, bool cpustats)
+ int bind, bool cpustats, u32 flags)
{
struct tc_action *p = kzalloc(ops->size, GFP_KERNEL);
struct tcf_idrinfo *idrinfo = tn->idrinfo;
@@ -427,6 +429,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
p->tcfa_tm.install = jiffies;
p->tcfa_tm.lastuse = jiffies;
p->tcfa_tm.firstuse = 0;
+ p->tcfa_flags = flags;
if (est) {
err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats,
&p->tcfa_rate_est,
@@ -451,6 +454,17 @@ err1:
}
EXPORT_SYMBOL(tcf_idr_create);
+int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index,
+ struct nlattr *est, struct tc_action **a,
+ const struct tc_action_ops *ops, int bind,
+ u32 flags)
+{
+ /* Set cpustats according to actions flags. */
+ return tcf_idr_create(tn, index, est, a, ops, bind,
+ !(flags & TCA_ACT_FLAGS_NO_PERCPU_STATS), flags);
+}
+EXPORT_SYMBOL(tcf_idr_create_from_flags);
+
void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
{
struct tcf_idrinfo *idrinfo = tn->idrinfo;
@@ -773,6 +787,14 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
}
rcu_read_unlock();
+ if (a->tcfa_flags) {
+ struct nla_bitfield32 flags = { a->tcfa_flags,
+ a->tcfa_flags, };
+
+ if (nla_put(skb, TCA_ACT_FLAGS, sizeof(flags), &flags))
+ goto nla_put_failure;
+ }
+
nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -831,12 +853,24 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
return c;
}
+static const u32 tca_act_flags_allowed = TCA_ACT_FLAGS_NO_PERCPU_STATS;
+static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = {
+ [TCA_ACT_KIND] = { .type = NLA_STRING },
+ [TCA_ACT_INDEX] = { .type = NLA_U32 },
+ [TCA_ACT_COOKIE] = { .type = NLA_BINARY,
+ .len = TC_COOKIE_MAX_SIZE },
+ [TCA_ACT_OPTIONS] = { .type = NLA_NESTED },
+ [TCA_ACT_FLAGS] = { .type = NLA_BITFIELD32,
+ .validation_data = &tca_act_flags_allowed },
+};
+
struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
struct nlattr *nla, struct nlattr *est,
char *name, int ovr, int bind,
bool rtnl_held,
struct netlink_ext_ack *extack)
{
+ struct nla_bitfield32 flags = { 0, 0 };
struct tc_action *a;
struct tc_action_ops *a_o;
struct tc_cookie *cookie = NULL;
@@ -846,8 +880,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
int err;
if (name == NULL) {
- err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL,
- extack);
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
+ tcf_action_policy, extack);
if (err < 0)
goto err_out;
err = -EINVAL;
@@ -861,13 +895,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
goto err_out;
}
if (tb[TCA_ACT_COOKIE]) {
- int cklen = nla_len(tb[TCA_ACT_COOKIE]);
-
- if (cklen > TC_COOKIE_MAX_SIZE) {
- NL_SET_ERR_MSG(extack, "TC cookie size above the maximum");
- goto err_out;
- }
-
cookie = nla_memdup_cookie(tb);
if (!cookie) {
NL_SET_ERR_MSG(extack, "No memory to generate TC cookie");
@@ -875,6 +902,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
goto err_out;
}
}
+ if (tb[TCA_ACT_FLAGS])
+ flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]);
} else {
if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) {
NL_SET_ERR_MSG(extack, "TC action name too long");
@@ -913,10 +942,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
/* backward compatibility for policer */
if (name == NULL)
err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind,
- rtnl_held, tp, extack);
+ rtnl_held, tp, flags.value, extack);
else
err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held,
- tp, extack);
+ tp, flags.value, extack);
if (err < 0)
goto err_mod;
@@ -974,7 +1003,6 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
err = PTR_ERR(act);
goto err;
}
- act->order = i;
sz += tcf_action_fill_size(act);
/* Start from index 0 */
actions[i - 1] = act;
@@ -988,6 +1016,29 @@ err:
return err;
}
+void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets,
+ bool drop, bool hw)
+{
+ if (a->cpu_bstats) {
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+
+ if (drop)
+ this_cpu_ptr(a->cpu_qstats)->drops += packets;
+
+ if (hw)
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
+ bytes, packets);
+ return;
+ }
+
+ _bstats_update(&a->tcfa_bstats, bytes, packets);
+ if (drop)
+ a->tcfa_qstats.drops += packets;
+ if (hw)
+ _bstats_update(&a->tcfa_bstats_hw, bytes, packets);
+}
+EXPORT_SYMBOL(tcf_action_update_stats);
+
int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
int compat_mode)
{
@@ -1098,7 +1149,8 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
int index;
int err;
- err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
+ tcf_action_policy, extack);
if (err < 0)
goto err_out;
@@ -1152,7 +1204,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
b = skb_tail_pointer(skb);
- err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
+ tcf_action_policy, extack);
if (err < 0)
goto err_out;
@@ -1350,11 +1403,16 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
struct netlink_ext_ack *extack)
{
size_t attr_size = 0;
- int ret = 0;
+ int loop, ret;
struct tc_action *actions[TCA_ACT_MAX_PRIO] = {};
- ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, actions,
- &attr_size, true, extack);
+ for (loop = 0; loop < 10; loop++) {
+ ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0,
+ actions, &attr_size, true, extack);
+ if (ret != -EAGAIN)
+ break;
+ }
+
if (ret < 0)
return ret;
ret = tcf_add_notify(net, n, actions, portid, attr_size, extack);
@@ -1404,11 +1462,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
*/
if (n->nlmsg_flags & NLM_F_REPLACE)
ovr = 1;
-replay:
ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr,
extack);
- if (ret == -EAGAIN)
- goto replay;
break;
case RTM_DELACTION:
ret = tca_action_gd(net, tca[TCA_ACT_TAB], n,
@@ -1440,7 +1495,7 @@ static struct nlattr *find_dump_kind(struct nlattr **nla)
if (tb[1] == NULL)
return NULL;
- if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], NULL, NULL) < 0)
+ if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], tcf_action_policy, NULL) < 0)
return NULL;
kind = tb2[TCA_ACT_KIND];
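
The act_api.c changes above add a netlink policy for TCA_ACT_FLAGS and thread the parsed value through every action's ->init() callback. A minimal sketch, not part of this patch, of how an individual action is expected to use the new plumbing; "foo", foo_net_id, act_foo_ops and the elided PARMS handling are placeholders:

static int tcf_foo_init(struct net *net, struct nlattr *nla,
			struct nlattr *est, struct tc_action **a,
			int ovr, int bind, bool rtnl_held,
			struct tcf_proto *tp, u32 flags,
			struct netlink_ext_ack *extack)
{
	struct tc_action_net *tn = net_generic(net, foo_net_id);
	u32 index = 0;	/* normally taken from the action's PARMS attribute */
	int err;

	err = tcf_idr_check_alloc(tn, &index, a, bind);
	if (err < 0)
		return err;
	if (!err) {
		/* per-CPU stats now follow TCA_ACT_FLAGS_NO_PERCPU_STATS
		 * instead of a hard-coded cpustats argument
		 */
		err = tcf_idr_create_from_flags(tn, index, est, a,
						&act_foo_ops, bind, flags);
		if (err) {
			tcf_idr_cleanup(tn, index);
			return err;
		}
	}
	/* parameter setup, goto_chain handling and tcf_idr_insert() elided */
	return err;
}

Forwarding the flags to tcf_idr_create_from_flags() lets TCA_ACT_FLAGS_NO_PERCPU_STATS select plain counters at creation time, and tcf_action_update_stats() then picks the per-CPU or plain counter set at runtime, as the hunks above show for act_csum, act_ct, act_gact and act_mirred.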
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index fd1f7e799e23..46f47e58b3be 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -275,7 +275,8 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
static int tcf_bpf_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **act,
int replace, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, bpf_net_id);
struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
@@ -303,7 +304,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
ret = tcf_idr_check_alloc(tn, &index, act, bind);
if (!ret) {
ret = tcf_idr_create(tn, index, est, act,
- &act_bpf_ops, bind, true);
+ &act_bpf_ops, bind, true, 0);
if (ret < 0) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -422,7 +423,7 @@ static __net_init int bpf_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, bpf_net_id);
- return tc_action_net_init(tn, &act_bpf_ops);
+ return tc_action_net_init(net, tn, &act_bpf_ops);
}
static void __net_exit bpf_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 32ac04d77a45..43a243081e7d 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -94,7 +94,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
static int tcf_connmark_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, connmark_net_id);
@@ -121,7 +121,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
ret = tcf_idr_check_alloc(tn, &index, a, bind);
if (!ret) {
ret = tcf_idr_create(tn, index, est, a,
- &act_connmark_ops, bind, false);
+ &act_connmark_ops, bind, false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -231,7 +231,7 @@ static __net_init int connmark_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, connmark_net_id);
- return tc_action_net_init(tn, &act_connmark_ops);
+ return tc_action_net_init(net, tn, &act_connmark_ops);
}
static void __net_exit connmark_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 9b9288267a54..cb8608f0a77a 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -43,7 +43,7 @@ static struct tc_action_ops act_csum_ops;
static int tcf_csum_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
int bind, bool rtnl_held, struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, csum_net_id);
struct tcf_csum_params *params_new;
@@ -68,8 +68,8 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
index = parm->index;
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, index, est, a,
- &act_csum_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_csum_ops, bind, flags);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -101,8 +101,8 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&p->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(p->params, params_new,
- lockdep_is_held(&p->tcf_lock));
+ params_new = rcu_replace_pointer(p->params, params_new,
+ lockdep_is_held(&p->tcf_lock));
spin_unlock_bh(&p->tcf_lock);
if (goto_ch)
@@ -580,7 +580,7 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a,
params = rcu_dereference_bh(p->params);
tcf_lastuse_update(&p->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&p->common, skb);
action = READ_ONCE(p->tcf_action);
if (unlikely(action == TC_ACT_SHOT))
@@ -624,7 +624,7 @@ out:
return action;
drop:
- qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats));
+ tcf_action_inc_drop_qstats(&p->common);
action = TC_ACT_SHOT;
goto out;
}
@@ -714,7 +714,7 @@ static __net_init int csum_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, csum_net_id);
- return tc_action_net_init(tn, &act_csum_ops);
+ return tc_action_net_init(net, tn, &act_csum_ops);
}
static void __net_exit csum_exit_net(struct list_head *net_list)
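
The rcu_swap_protected() to rcu_replace_pointer() conversions in this series are mechanical: rcu_replace_pointer() publishes the new pointer and hands back the old one, so the local variable ends up holding the stale copy either way. A condensed sketch of the resulting update path, reusing act_csum's names (the freeing step falls outside the hunk shown above):

	spin_lock_bh(&p->tcf_lock);
	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
	params_new = rcu_replace_pointer(p->params, params_new,
					 lockdep_is_held(&p->tcf_lock));
	spin_unlock_bh(&p->tcf_lock);

	if (goto_ch)
		tcf_chain_put_by_act(goto_ch);
	/* the old copy is unreachable for new readers; kfree_rcu() frees it
	 * only after a grace period, so existing readers stay safe
	 */
	if (params_new)
		kfree_rcu(params_new, rcu);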
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 33a1a7406e87..f685c0d73708 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -24,12 +24,12 @@
#include <uapi/linux/tc_act/tc_ct.h>
#include <net/tc_act/tc_ct.h>
-#include <linux/netfilter/nf_nat.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <uapi/linux/netfilter/nf_nat.h>
static struct tc_action_ops act_ct_ops;
static unsigned int ct_net_id;
@@ -312,7 +312,7 @@ static void tcf_ct_act_set_labels(struct nf_conn *ct,
u32 *labels_m)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)
- size_t labels_sz = FIELD_SIZEOF(struct tcf_ct_params, labels);
+ size_t labels_sz = sizeof_field(struct tcf_ct_params, labels);
if (!memchr_inv(labels_m, 0, labels_sz))
return;
@@ -329,6 +329,7 @@ static int tcf_ct_act_nat(struct sk_buff *skb,
bool commit)
{
#if IS_ENABLED(CONFIG_NF_NAT)
+ int err;
enum nf_nat_manip_type maniptype;
if (!(ct_action & TCA_CT_ACT_NAT))
@@ -359,7 +360,17 @@ static int tcf_ct_act_nat(struct sk_buff *skb,
return NF_ACCEPT;
}
- return ct_nat_execute(skb, ct, ctinfo, range, maniptype);
+ err = ct_nat_execute(skb, ct, ctinfo, range, maniptype);
+ if (err == NF_ACCEPT &&
+ ct->status & IPS_SRC_NAT && ct->status & IPS_DST_NAT) {
+ if (maniptype == NF_NAT_MANIP_SRC)
+ maniptype = NF_NAT_MANIP_DST;
+ else
+ maniptype = NF_NAT_MANIP_SRC;
+
+ err = ct_nat_execute(skb, ct, ctinfo, range, maniptype);
+ }
+ return err;
#else
return NF_ACCEPT;
#endif
@@ -465,16 +476,15 @@ out_push:
skb_push_rcsum(skb, nh_ofs);
out:
- bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb);
+ tcf_action_update_bstats(&c->common, skb);
return retval;
drop:
- qstats_drop_inc(this_cpu_ptr(a->cpu_qstats));
+ tcf_action_inc_drop_qstats(&c->common);
return TC_ACT_SHOT;
}
static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
- [TCA_CT_UNSPEC] = { .strict_start_type = TCA_CT_UNSPEC + 1 },
[TCA_CT_ACTION] = { .type = NLA_U16 },
[TCA_CT_PARMS] = { .type = NLA_EXACT_LEN, .len = sizeof(struct tc_ct) },
[TCA_CT_ZONE] = { .type = NLA_U16 },
@@ -656,7 +666,7 @@ static int tcf_ct_fill_params(struct net *net,
static int tcf_ct_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int replace, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, ct_net_id);
@@ -688,8 +698,8 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
return err;
if (!err) {
- err = tcf_idr_create(tn, index, est, a,
- &act_ct_ops, bind, true);
+ err = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_ct_ops, bind, flags);
if (err) {
tcf_idr_cleanup(tn, index);
return err;
@@ -722,7 +732,8 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&c->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(c->params, params, lockdep_is_held(&c->tcf_lock));
+ params = rcu_replace_pointer(c->params, params,
+ lockdep_is_held(&c->tcf_lock));
spin_unlock_bh(&c->tcf_lock);
if (goto_ch)
@@ -905,11 +916,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
{
struct tcf_ct *c = to_ct(a);
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
-
- if (hw)
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
- bytes, packets);
+ tcf_action_update_stats(a, bytes, packets, false, hw);
c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse);
}
@@ -929,7 +936,7 @@ static struct tc_action_ops act_ct_ops = {
static __net_init int ct_init_net(struct net *net)
{
- unsigned int n_bits = FIELD_SIZEOF(struct tcf_ct_params, labels) * 8;
+ unsigned int n_bits = sizeof_field(struct tcf_ct_params, labels) * 8;
struct tc_ct_action_net *tn = net_generic(net, ct_net_id);
if (nf_connlabels_get(net, n_bits - 1)) {
@@ -939,7 +946,7 @@ static __net_init int ct_init_net(struct net *net)
tn->labels = true;
}
- return tc_action_net_init(&tn->tn, &act_ct_ops);
+ return tc_action_net_init(net, &tn->tn, &act_ct_ops);
}
static void __net_exit ct_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
index 06ef74b74911..19649623493b 100644
--- a/net/sched/act_ctinfo.c
+++ b/net/sched/act_ctinfo.c
@@ -153,7 +153,7 @@ static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = {
static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
@@ -210,7 +210,7 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
ret = tcf_idr_create(tn, index, est, a,
- &act_ctinfo_ops, bind, false);
+ &act_ctinfo_ops, bind, false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -257,8 +257,8 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&ci->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch);
- rcu_swap_protected(ci->params, cp_new,
- lockdep_is_held(&ci->tcf_lock));
+ cp_new = rcu_replace_pointer(ci->params, cp_new,
+ lockdep_is_held(&ci->tcf_lock));
spin_unlock_bh(&ci->tcf_lock);
if (goto_ch)
@@ -360,6 +360,16 @@ static int tcf_ctinfo_search(struct net *net, struct tc_action **a, u32 index)
return tcf_idr_search(tn, a, index);
}
+static void tcf_ctinfo_cleanup(struct tc_action *a)
+{
+ struct tcf_ctinfo *ci = to_ctinfo(a);
+ struct tcf_ctinfo_params *cp;
+
+ cp = rcu_dereference_protected(ci->params, 1);
+ if (cp)
+ kfree_rcu(cp, rcu);
+}
+
static struct tc_action_ops act_ctinfo_ops = {
.kind = "ctinfo",
.id = TCA_ID_CTINFO,
@@ -367,6 +377,7 @@ static struct tc_action_ops act_ctinfo_ops = {
.act = tcf_ctinfo_act,
.dump = tcf_ctinfo_dump,
.init = tcf_ctinfo_init,
+ .cleanup= tcf_ctinfo_cleanup,
.walk = tcf_ctinfo_walker,
.lookup = tcf_ctinfo_search,
.size = sizeof(struct tcf_ctinfo),
@@ -376,7 +387,7 @@ static __net_init int ctinfo_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
- return tc_action_net_init(tn, &act_ctinfo_ops);
+ return tc_action_net_init(net, tn, &act_ctinfo_ops);
}
static void __net_exit ctinfo_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 8f0140c6ca58..416065772719 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -53,7 +53,8 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
static int tcf_gact_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, gact_net_id);
struct nlattr *tb[TCA_GACT_MAX + 1];
@@ -98,8 +99,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, index, est, a,
- &act_gact_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_gact_ops, bind, flags);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -161,9 +162,9 @@ static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a,
action = gact_rand[ptype](gact);
}
#endif
- bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&gact->common, skb);
if (action == TC_ACT_SHOT)
- qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats));
+ tcf_action_inc_drop_qstats(&gact->common);
tcf_lastuse_update(&gact->tcf_tm);
@@ -177,15 +178,7 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
int action = READ_ONCE(gact->tcf_action);
struct tcf_t *tm = &gact->tcf_tm;
- _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes,
- packets);
- if (action == TC_ACT_SHOT)
- this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
-
- if (hw)
- _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats_hw),
- bytes, packets);
-
+ tcf_action_update_stats(a, bytes, packets, action == TC_ACT_SHOT, hw);
tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
@@ -278,7 +271,7 @@ static __net_init int gact_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, gact_net_id);
- return tc_action_net_init(tn, &act_gact_ops);
+ return tc_action_net_init(net, tn, &act_gact_ops);
}
static void __net_exit gact_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 92ee853d43e6..c1fcd85719d6 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -465,7 +465,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
static int tcf_ife_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, ife_net_id);
struct nlattr *tb[TCA_IFE_MAX + 1];
@@ -522,7 +523,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, index, est, a, &act_ife_ops,
- bind, true);
+ bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
kfree(p);
@@ -536,6 +537,9 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
}
ife = to_ife(*a);
+ if (ret == ACT_P_CREATED)
+ INIT_LIST_HEAD(&ife->metalist);
+
err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
if (err < 0)
goto release_idr;
@@ -565,10 +569,6 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
p->eth_type = ife_type;
}
-
- if (ret == ACT_P_CREATED)
- INIT_LIST_HEAD(&ife->metalist);
-
if (tb[TCA_IFE_METALST]) {
err = nla_parse_nested_deprecated(tb2, IFE_META_MAX,
tb[TCA_IFE_METALST], NULL,
@@ -594,7 +594,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&ife->tcf_lock);
/* protected by tcf_lock when modifying existing action */
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(ife->params, p, 1);
+ p = rcu_replace_pointer(ife->params, p, 1);
if (exists)
spin_unlock_bh(&ife->tcf_lock);
@@ -890,7 +890,7 @@ static __net_init int ife_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, ife_net_id);
- return tc_action_net_init(tn, &act_ife_ops);
+ return tc_action_net_init(net, tn, &act_ife_ops);
}
static void __net_exit ife_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index ce2c30a591d2..400a2cfe8452 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -61,12 +61,13 @@ static int ipt_init_target(struct net *net, struct xt_entry_target *t,
return 0;
}
-static void ipt_destroy_target(struct xt_entry_target *t)
+static void ipt_destroy_target(struct xt_entry_target *t, struct net *net)
{
struct xt_tgdtor_param par = {
.target = t->u.kernel.target,
.targinfo = t->data,
.family = NFPROTO_IPV4,
+ .net = net,
};
if (par.target->destroy != NULL)
par.target->destroy(&par);
@@ -78,7 +79,7 @@ static void tcf_ipt_release(struct tc_action *a)
struct tcf_ipt *ipt = to_ipt(a);
if (ipt->tcfi_t) {
- ipt_destroy_target(ipt->tcfi_t);
+ ipt_destroy_target(ipt->tcfi_t, a->idrinfo->net);
kfree(ipt->tcfi_t);
}
kfree(ipt->tcfi_tname);
@@ -94,7 +95,7 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
const struct tc_action_ops *ops, int ovr, int bind,
- struct tcf_proto *tp)
+ struct tcf_proto *tp, u32 flags)
{
struct tc_action_net *tn = net_generic(net, id);
struct nlattr *tb[TCA_IPT_MAX + 1];
@@ -143,7 +144,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, index, est, a, ops, bind,
- false);
+ false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -180,7 +181,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
spin_lock_bh(&ipt->tcf_lock);
if (ret != ACT_P_CREATED) {
- ipt_destroy_target(ipt->tcfi_t);
+ ipt_destroy_target(ipt->tcfi_t, net);
kfree(ipt->tcfi_tname);
kfree(ipt->tcfi_t);
}
@@ -204,19 +205,19 @@ err1:
static int tcf_ipt_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
int bind, bool rtnl_held, struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr,
- bind, tp);
+ bind, tp, flags);
}
static int tcf_xt_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
int bind, bool unlocked, struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr,
- bind, tp);
+ bind, tp, flags);
}
static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a,
@@ -350,7 +351,7 @@ static __net_init int ipt_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, ipt_net_id);
- return tc_action_net_init(tn, &act_ipt_ops);
+ return tc_action_net_init(net, tn, &act_ipt_ops);
}
static void __net_exit ipt_exit_net(struct list_head *net_list)
@@ -399,7 +400,7 @@ static __net_init int xt_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, xt_net_id);
- return tc_action_net_init(tn, &act_xt_ops);
+ return tc_action_net_init(net, tn, &act_xt_ops);
}
static void __net_exit xt_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index be3f88dfc37e..1ad300e6dbc0 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -93,7 +93,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, mirred_net_id);
struct nlattr *tb[TCA_MIRRED_MAX + 1];
@@ -148,8 +148,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist");
return -EINVAL;
}
- ret = tcf_idr_create(tn, index, est, a,
- &act_mirred_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_mirred_ops, bind, flags);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -178,8 +178,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
goto put_chain;
}
mac_header_xmit = dev_is_mac_header_xmit(dev);
- rcu_swap_protected(m->tcfm_dev, dev,
- lockdep_is_held(&m->tcf_lock));
+ dev = rcu_replace_pointer(m->tcfm_dev, dev,
+ lockdep_is_held(&m->tcf_lock));
if (dev)
dev_put(dev);
m->tcfm_mac_header_xmit = mac_header_xmit;
@@ -219,8 +219,10 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
bool use_reinsert;
bool want_ingress;
bool is_redirect;
+ bool expects_nh;
int m_eaction;
int mac_len;
+ bool at_nh;
rec_level = __this_cpu_inc_return(mirred_rec_level);
if (unlikely(rec_level > MIRRED_RECURSION_LIMIT)) {
@@ -231,7 +233,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
}
tcf_lastuse_update(&m->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&m->common, skb);
m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
m_eaction = READ_ONCE(m->tcfm_eaction);
@@ -261,19 +263,19 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
goto out;
}
- /* If action's target direction differs than filter's direction,
- * and devices expect a mac header on xmit, then mac push/pull is
- * needed.
- */
want_ingress = tcf_mirred_act_wants_ingress(m_eaction);
- if (skb_at_tc_ingress(skb) != want_ingress && m_mac_header_xmit) {
- if (!skb_at_tc_ingress(skb)) {
- /* caught at egress, act ingress: pull mac */
- mac_len = skb_network_header(skb) - skb_mac_header(skb);
+
+ expects_nh = want_ingress || !m_mac_header_xmit;
+ at_nh = skb->data == skb_network_header(skb);
+ if (at_nh != expects_nh) {
+ mac_len = skb_at_tc_ingress(skb) ? skb->mac_len :
+ skb_network_header(skb) - skb_mac_header(skb);
+ if (expects_nh) {
+ /* target device/action expect data at nh */
skb_pull_rcsum(skb2, mac_len);
} else {
- /* caught at ingress, act egress: push mac */
- skb_push_rcsum(skb2, skb->mac_len);
+ /* target device/action expect data at mac */
+ skb_push_rcsum(skb2, mac_len);
}
}
@@ -289,8 +291,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
/* let's the caller reinsert the packet, if possible */
if (use_reinsert) {
res->ingress = want_ingress;
- res->qstats = this_cpu_ptr(m->common.cpu_qstats);
- skb_tc_reinsert(skb, res);
+ if (skb_tc_reinsert(skb, res))
+ tcf_action_inc_overlimit_qstats(&m->common);
__this_cpu_dec(mirred_rec_level);
return TC_ACT_CONSUMED;
}
@@ -303,7 +305,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
if (err) {
out:
- qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats));
+ tcf_action_inc_overlimit_qstats(&m->common);
if (tcf_mirred_is_act_redirect(m_eaction))
retval = TC_ACT_SHOT;
}
@@ -318,10 +320,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
struct tcf_mirred *m = to_mirred(a);
struct tcf_t *tm = &m->tcf_tm;
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
- if (hw)
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
- bytes, packets);
+ tcf_action_update_stats(a, bytes, packets, false, hw);
tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
@@ -408,25 +407,31 @@ static struct notifier_block mirred_device_notifier = {
.notifier_call = mirred_device_event,
};
-static struct net_device *tcf_mirred_get_dev(const struct tc_action *a)
+static void tcf_mirred_dev_put(void *priv)
+{
+ struct net_device *dev = priv;
+
+ dev_put(dev);
+}
+
+static struct net_device *
+tcf_mirred_get_dev(const struct tc_action *a,
+ tc_action_priv_destructor *destructor)
{
struct tcf_mirred *m = to_mirred(a);
struct net_device *dev;
rcu_read_lock();
dev = rcu_dereference(m->tcfm_dev);
- if (dev)
+ if (dev) {
dev_hold(dev);
+ *destructor = tcf_mirred_dev_put;
+ }
rcu_read_unlock();
return dev;
}
-static void tcf_mirred_put_dev(struct net_device *dev)
-{
- dev_put(dev);
-}
-
static size_t tcf_mirred_get_fill_size(const struct tc_action *act)
{
return nla_total_size(sizeof(struct tc_mirred));
@@ -446,14 +451,13 @@ static struct tc_action_ops act_mirred_ops = {
.get_fill_size = tcf_mirred_get_fill_size,
.size = sizeof(struct tcf_mirred),
.get_dev = tcf_mirred_get_dev,
- .put_dev = tcf_mirred_put_dev,
};
static __net_init int mirred_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, mirred_net_id);
- return tc_action_net_init(tn, &act_mirred_ops);
+ return tc_action_net_init(net, tn, &act_mirred_ops);
}
static void __net_exit mirred_exit_net(struct list_head *net_list)
@@ -479,7 +483,11 @@ static int __init mirred_init_module(void)
return err;
pr_info("Mirror/redirect action on\n");
- return tcf_register_action(&act_mirred_ops, &mirred_net_ops);
+ err = tcf_register_action(&act_mirred_ops, &mirred_net_ops);
+ if (err)
+ unregister_netdevice_notifier(&mirred_device_notifier);
+
+ return err;
}
static void __exit mirred_cleanup_module(void)
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
index 0f299e3b618c..be3f215cd027 100644
--- a/net/sched/act_mpls.c
+++ b/net/sched/act_mpls.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
/* Copyright (C) 2019 Netronome Systems, Inc. */
+#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -55,7 +56,7 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a,
struct tcf_mpls *m = to_mpls(a);
struct tcf_mpls_params *p;
__be32 new_lse;
- int ret;
+ int ret, mac_len;
tcf_lastuse_update(&m->tcf_tm);
bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
@@ -63,8 +64,12 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a,
/* Ensure 'data' points at mac_header prior calling mpls manipulating
* functions.
*/
- if (skb_at_tc_ingress(skb))
+ if (skb_at_tc_ingress(skb)) {
skb_push_rcsum(skb, skb->mac_len);
+ mac_len = skb->mac_len;
+ } else {
+ mac_len = skb_network_header(skb) - skb_mac_header(skb);
+ }
ret = READ_ONCE(m->tcf_action);
@@ -72,12 +77,14 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a,
switch (p->tcfm_action) {
case TCA_MPLS_ACT_POP:
- if (skb_mpls_pop(skb, p->tcfm_proto))
+ if (skb_mpls_pop(skb, p->tcfm_proto, mac_len,
+ skb->dev && skb->dev->type == ARPHRD_ETHER))
goto drop;
break;
case TCA_MPLS_ACT_PUSH:
new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol));
- if (skb_mpls_push(skb, new_lse, p->tcfm_proto))
+ if (skb_mpls_push(skb, new_lse, p->tcfm_proto, mac_len,
+ skb->dev && skb->dev->type == ARPHRD_ETHER))
goto drop;
break;
case TCA_MPLS_ACT_MODIFY:
@@ -115,7 +122,6 @@ static int valid_label(const struct nlattr *attr,
}
static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = {
- [TCA_MPLS_UNSPEC] = { .strict_start_type = TCA_MPLS_UNSPEC + 1 },
[TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)),
[TCA_MPLS_PROTO] = { .type = NLA_U16 },
[TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_U32, valid_label),
@@ -127,7 +133,8 @@ static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = {
static int tcf_mpls_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, mpls_net_id);
struct nlattr *tb[TCA_MPLS_MAX + 1];
@@ -220,7 +227,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, index, est, a,
- &act_mpls_ops, bind, true);
+ &act_mpls_ops, bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -258,7 +265,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&m->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(m->mpls_p, p, lockdep_is_held(&m->tcf_lock));
+ p = rcu_replace_pointer(m->mpls_p, p, lockdep_is_held(&m->tcf_lock));
spin_unlock_bh(&m->tcf_lock);
if (goto_ch)
@@ -375,7 +382,7 @@ static __net_init int mpls_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, mpls_net_id);
- return tc_action_net_init(tn, &act_mpls_ops);
+ return tc_action_net_init(net, tn, &act_mpls_ops);
}
static void __net_exit mpls_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 7b858c11b1b5..855a6fa16a62 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -36,7 +36,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
struct tc_action **a, int ovr, int bind,
bool rtnl_held, struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, nat_net_id);
struct nlattr *tb[TCA_NAT_MAX + 1];
@@ -61,7 +61,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
ret = tcf_idr_create(tn, index, est, a,
- &act_nat_ops, bind, false);
+ &act_nat_ops, bind, false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -206,9 +206,7 @@ static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a,
icmph = (void *)(skb_network_header(skb) + ihl);
- if ((icmph->type != ICMP_DEST_UNREACH) &&
- (icmph->type != ICMP_TIME_EXCEEDED) &&
- (icmph->type != ICMP_PARAMETERPROB))
+ if (!icmp_is_err(icmph->type))
break;
if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) +
@@ -327,7 +325,7 @@ static __net_init int nat_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, nat_net_id);
- return tc_action_net_init(tn, &act_nat_ops);
+ return tc_action_net_init(net, tn, &act_nat_ops);
}
static void __net_exit nat_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 17360c6faeaa..3ad718576304 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -43,7 +43,7 @@ static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
int err = -EINVAL;
int rem;
- if (!nla || !n)
+ if (!nla)
return NULL;
keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL);
@@ -137,7 +137,8 @@ nla_failure:
static int tcf_pedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, pedit_net_id);
struct nlattr *tb[TCA_PEDIT_MAX + 1];
@@ -170,6 +171,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
}
parm = nla_data(pattr);
+ if (!parm->nkeys) {
+ NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed");
+ return -EINVAL;
+ }
ksize = parm->nkeys * sizeof(struct tc_pedit_key);
if (nla_len(pattr) < sizeof(*parm) + ksize) {
NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid");
@@ -183,14 +188,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
index = parm->index;
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- if (!parm->nkeys) {
- tcf_idr_cleanup(tn, index);
- NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed");
- ret = -EINVAL;
- goto out_free;
- }
ret = tcf_idr_create(tn, index, est, a,
- &act_pedit_ops, bind, false);
+ &act_pedit_ops, bind, false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
goto out_free;
@@ -498,7 +497,7 @@ static __net_init int pedit_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, pedit_net_id);
- return tc_action_net_init(tn, &act_pedit_ops);
+ return tc_action_net_init(net, tn, &act_pedit_ops);
}
static void __net_exit pedit_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 49cec3e64a4d..8b7a0ac96c51 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -40,12 +40,14 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
[TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE },
[TCA_POLICE_AVRATE] = { .type = NLA_U32 },
[TCA_POLICE_RESULT] = { .type = NLA_U32 },
+ [TCA_POLICE_RATE64] = { .type = NLA_U64 },
+ [TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 },
};
static int tcf_police_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
int ret = 0, tcfp_result = TC_ACT_OK, err, size;
@@ -58,6 +60,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
struct tcf_police_params *new;
bool exists = false;
u32 index;
+ u64 rate64, prate64;
if (nla == NULL)
return -EINVAL;
@@ -84,7 +87,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, index, NULL, a,
- &act_police_ops, bind, true);
+ &act_police_ops, bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -155,14 +158,18 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
}
if (R_tab) {
new->rate_present = true;
- psched_ratecfg_precompute(&new->rate, &R_tab->rate, 0);
+ rate64 = tb[TCA_POLICE_RATE64] ?
+ nla_get_u64(tb[TCA_POLICE_RATE64]) : 0;
+ psched_ratecfg_precompute(&new->rate, &R_tab->rate, rate64);
qdisc_put_rtab(R_tab);
} else {
new->rate_present = false;
}
if (P_tab) {
new->peak_present = true;
- psched_ratecfg_precompute(&new->peak, &P_tab->rate, 0);
+ prate64 = tb[TCA_POLICE_PEAKRATE64] ?
+ nla_get_u64(tb[TCA_POLICE_PEAKRATE64]) : 0;
+ psched_ratecfg_precompute(&new->peak, &P_tab->rate, prate64);
qdisc_put_rtab(P_tab);
} else {
new->peak_present = false;
@@ -184,9 +191,9 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
police->tcfp_ptoks = new->tcfp_mtu_ptoks;
spin_unlock_bh(&police->tcfp_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(police->params,
- new,
- lockdep_is_held(&police->tcf_lock));
+ new = rcu_replace_pointer(police->params,
+ new,
+ lockdep_is_held(&police->tcf_lock));
spin_unlock_bh(&police->tcf_lock);
if (goto_ch)
@@ -287,10 +294,7 @@ static void tcf_police_stats_update(struct tc_action *a,
struct tcf_police *police = to_police(a);
struct tcf_t *tm = &police->tcf_tm;
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
- if (hw)
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
- bytes, packets);
+ tcf_action_update_stats(a, bytes, packets, false, hw);
tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
@@ -313,10 +317,22 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
lockdep_is_held(&police->tcf_lock));
opt.mtu = p->tcfp_mtu;
opt.burst = PSCHED_NS2TICKS(p->tcfp_burst);
- if (p->rate_present)
+ if (p->rate_present) {
psched_ratecfg_getrate(&opt.rate, &p->rate);
- if (p->peak_present)
+ if ((police->params->rate.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64_64bit(skb, TCA_POLICE_RATE64,
+ police->params->rate.rate_bytes_ps,
+ TCA_POLICE_PAD))
+ goto nla_put_failure;
+ }
+ if (p->peak_present) {
psched_ratecfg_getrate(&opt.peakrate, &p->peak);
+ if ((police->params->peak.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64_64bit(skb, TCA_POLICE_PEAKRATE64,
+ police->params->peak.rate_bytes_ps,
+ TCA_POLICE_PAD))
+ goto nla_put_failure;
+ }
if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
goto nla_put_failure;
if (p->tcfp_result &&
@@ -326,10 +342,7 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
nla_put_u32(skb, TCA_POLICE_AVRATE, p->tcfp_ewma_rate))
goto nla_put_failure;
- t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install);
- t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse);
- t.firstuse = jiffies_to_clock_t(jiffies - police->tcf_tm.firstuse);
- t.expires = jiffies_to_clock_t(police->tcf_tm.expires);
+ tcf_tm_dump(&t, &police->tcf_tm);
if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD))
goto nla_put_failure;
spin_unlock_bh(&police->tcf_lock);
@@ -371,7 +384,7 @@ static __net_init int police_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, police_net_id);
- return tc_action_net_init(tn, &act_police_ops);
+ return tc_action_net_init(net, tn, &act_police_ops);
}
static void __net_exit police_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 595308d60133..ce948c1e24dc 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -36,7 +36,7 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
static int tcf_sample_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
int bind, bool rtnl_held, struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, sample_net_id);
struct nlattr *tb[TCA_SAMPLE_MAX + 1];
@@ -69,7 +69,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, index, est, a,
- &act_sample_ops, bind, true);
+ &act_sample_ops, bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -102,13 +102,17 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
s->rate = rate;
s->psample_group_num = psample_group_num;
- RCU_INIT_POINTER(s->psample_group, psample_group);
+ psample_group = rcu_replace_pointer(s->psample_group, psample_group,
+ lockdep_is_held(&s->tcf_lock));
if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
s->truncate = true;
s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]);
}
spin_unlock_bh(&s->tcf_lock);
+
+ if (psample_group)
+ psample_group_put(psample_group);
if (goto_ch)
tcf_chain_put_by_act(goto_ch);
@@ -142,6 +146,7 @@ static bool tcf_sample_dev_ok_push(struct net_device *dev)
case ARPHRD_TUNNEL6:
case ARPHRD_SIT:
case ARPHRD_IPGRE:
+ case ARPHRD_IP6GRE:
case ARPHRD_VOID:
case ARPHRD_NONE:
return false;
@@ -248,6 +253,32 @@ static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index)
return tcf_idr_search(tn, a, index);
}
+static void tcf_psample_group_put(void *priv)
+{
+ struct psample_group *group = priv;
+
+ psample_group_put(group);
+}
+
+static struct psample_group *
+tcf_sample_get_group(const struct tc_action *a,
+ tc_action_priv_destructor *destructor)
+{
+ struct tcf_sample *s = to_sample(a);
+ struct psample_group *group;
+
+ spin_lock_bh(&s->tcf_lock);
+ group = rcu_dereference_protected(s->psample_group,
+ lockdep_is_held(&s->tcf_lock));
+ if (group) {
+ psample_group_take(group);
+ *destructor = tcf_psample_group_put;
+ }
+ spin_unlock_bh(&s->tcf_lock);
+
+ return group;
+}
+
static struct tc_action_ops act_sample_ops = {
.kind = "sample",
.id = TCA_ID_SAMPLE,
@@ -258,6 +289,7 @@ static struct tc_action_ops act_sample_ops = {
.cleanup = tcf_sample_cleanup,
.walk = tcf_sample_walker,
.lookup = tcf_sample_search,
+ .get_psample_group = tcf_sample_get_group,
.size = sizeof(struct tcf_sample),
};
@@ -265,7 +297,7 @@ static __net_init int sample_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, sample_net_id);
- return tc_action_net_init(tn, &act_sample_ops);
+ return tc_action_net_init(net, tn, &act_sample_ops);
}
static void __net_exit sample_exit_net(struct list_head *net_list)
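The get_psample_group op added above lends the offload path a reference together with a destructor; tc_setup_flow_action() and tc_cleanup_flow_action() in cls_api.c further down in this diff are the consumers. A hedged sketch of how the two halves meet, with the loop variables as placeholders:

	/* Sketch only: the setup side stores the borrowed group and its
	 * destructor in the flow_action_entry ...
	 */
	entry->sample.psample_group =
		act->ops->get_psample_group(act, &entry->destructor);
	entry->destructor_priv = entry->sample.psample_group;

	/* ... and teardown walks the entries and drops every reference
	 * that was handed out this way.
	 */
	flow_action_for_each(i, entry, flow_action)
		if (entry->destructor)
			entry->destructor(entry->destructor_priv);

This is why tcf_sample_get_group() takes its reference under tcf_lock: the pointer must stay valid after the action lock is released, until the driver offload request has been built and cleaned up.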
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 33aefa25b545..9813ca4006dd 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -35,7 +35,7 @@ static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a,
* Example if this was the 3rd packet and the string was "hello"
* then it would look like "hello_3" (without quotes)
*/
- pr_info("simple: %s_%d\n",
+ pr_info("simple: %s_%llu\n",
(char *)d->tcfd_defdata, d->tcf_bstats.packets);
spin_unlock(&d->tcf_lock);
return d->tcf_action;
@@ -86,7 +86,8 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
static int tcf_simp_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, simp_net_id);
struct nlattr *tb[TCA_DEF_MAX + 1];
@@ -127,7 +128,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, index, est, a,
- &act_simp_ops, bind, false);
+ &act_simp_ops, bind, false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -232,7 +233,7 @@ static __net_init int simp_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, simp_net_id);
- return tc_action_net_init(tn, &act_simp_ops);
+ return tc_action_net_init(net, tn, &act_simp_ops);
}
static void __net_exit simp_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 37dced00b63d..e857424c387c 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -86,7 +86,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 act_flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, skbedit_net_id);
@@ -165,7 +165,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, index, est, a,
- &act_skbedit_ops, bind, true);
+ &act_skbedit_ops, bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -206,8 +206,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&d->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(d->params, params_new,
- lockdep_is_held(&d->tcf_lock));
+ params_new = rcu_replace_pointer(d->params, params_new,
+ lockdep_is_held(&d->tcf_lock));
spin_unlock_bh(&d->tcf_lock);
if (params_new)
kfree_rcu(params_new, rcu);
@@ -336,7 +336,7 @@ static __net_init int skbedit_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, skbedit_net_id);
- return tc_action_net_init(tn, &act_skbedit_ops);
+ return tc_action_net_init(net, tn, &act_skbedit_ops);
}
static void __net_exit skbedit_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 7da3518e18ef..39e6d94cfafb 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -79,7 +79,7 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, skbmod_net_id);
@@ -143,7 +143,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, index, est, a,
- &act_skbmod_ops, bind, true);
+ &act_skbmod_ops, bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -287,7 +287,7 @@ static __net_init int skbmod_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, skbmod_net_id);
- return tc_action_net_init(tn, &act_skbmod_ops);
+ return tc_action_net_init(net, tn, &act_skbmod_ops);
}
static void __net_exit skbmod_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 6d0debdc9b97..536c4bc31be6 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -10,6 +10,8 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <net/geneve.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
@@ -31,7 +33,7 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
params = rcu_dereference_bh(t->params);
tcf_lastuse_update(&t->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&t->common, skb);
action = READ_ONCE(t->tcf_action);
switch (params->tcft_action) {
@@ -53,7 +55,11 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
static const struct nla_policy
enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPTS_UNSPEC] = {
+ .strict_start_type = TCA_TUNNEL_KEY_ENC_OPTS_VXLAN },
[TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED },
+ [TCA_TUNNEL_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED },
+ [TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED },
};
static const struct nla_policy
@@ -64,6 +70,19 @@ geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = {
.len = 128 },
};
+static const struct nla_policy
+vxlan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+erspan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 },
+};
+
static int
tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len,
struct netlink_ext_ack *extack)
@@ -116,10 +135,89 @@ tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len,
return opt_len;
}
+static int
+tunnel_key_copy_vxlan_opt(const struct nlattr *nla, void *dst, int dst_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, nla,
+ vxlan_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp");
+ return -EINVAL;
+ }
+
+ if (dst) {
+ struct vxlan_metadata *md = dst;
+
+ md->gbp = nla_get_u32(tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]);
+ }
+
+ return sizeof(struct vxlan_metadata);
+}
+
+static int
+tunnel_key_copy_erspan_opt(const struct nlattr *nla, void *dst, int dst_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1];
+ int err;
+ u8 ver;
+
+ err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX, nla,
+ erspan_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver");
+ return -EINVAL;
+ }
+
+ ver = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]);
+ if (ver == 1) {
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index");
+ return -EINVAL;
+ }
+ } else if (ver == 2) {
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] ||
+ !tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid");
+ return -EINVAL;
+ }
+ } else {
+ NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect");
+ return -EINVAL;
+ }
+
+ if (dst) {
+ struct erspan_metadata *md = dst;
+
+ md->version = ver;
+ if (ver == 1) {
+ nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX];
+ md->u.index = nla_get_be32(nla);
+ } else {
+ nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR];
+ md->u.md2.dir = nla_get_u8(nla);
+ nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID];
+ set_hwid(&md->u.md2, nla_get_u8(nla));
+ }
+ }
+
+ return sizeof(struct erspan_metadata);
+}
+
static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
int dst_len, struct netlink_ext_ack *extack)
{
- int err, rem, opt_len, len = nla_len(nla), opts_len = 0;
+ int err, rem, opt_len, len = nla_len(nla), opts_len = 0, type = 0;
const struct nlattr *attr, *head = nla_data(nla);
err = nla_validate_deprecated(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX,
@@ -130,15 +228,48 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
nla_for_each_attr(attr, head, len, rem) {
switch (nla_type(attr)) {
case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
+ if (type && type != TUNNEL_GENEVE_OPT) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for geneve options");
+ return -EINVAL;
+ }
opt_len = tunnel_key_copy_geneve_opt(attr, dst,
dst_len, extack);
if (opt_len < 0)
return opt_len;
opts_len += opt_len;
+ if (opts_len > IP_TUNNEL_OPTS_MAX) {
+ NL_SET_ERR_MSG(extack, "Tunnel options exceeds max size");
+ return -EINVAL;
+ }
if (dst) {
dst_len -= opt_len;
dst += opt_len;
}
+ type = TUNNEL_GENEVE_OPT;
+ break;
+ case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN:
+ if (type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options");
+ return -EINVAL;
+ }
+ opt_len = tunnel_key_copy_vxlan_opt(attr, dst,
+ dst_len, extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = TUNNEL_VXLAN_OPT;
+ break;
+ case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN:
+ if (type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for erspan options");
+ return -EINVAL;
+ }
+ opt_len = tunnel_key_copy_erspan_opt(attr, dst,
+ dst_len, extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = TUNNEL_ERSPAN_OPT;
break;
}
}
@@ -175,6 +306,22 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info,
#else
return -EAFNOSUPPORT;
#endif
+ case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN:
+#if IS_ENABLED(CONFIG_INET)
+ info->key.tun_flags |= TUNNEL_VXLAN_OPT;
+ return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
+ opts_len, extack);
+#else
+ return -EAFNOSUPPORT;
+#endif
+ case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN:
+#if IS_ENABLED(CONFIG_INET)
+ info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+ return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
+ opts_len, extack);
+#else
+ return -EAFNOSUPPORT;
+#endif
default:
NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type");
return -EINVAL;
@@ -208,7 +355,7 @@ static void tunnel_key_release_params(struct tcf_tunnel_key_params *p)
static int tunnel_key_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 act_flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
@@ -347,8 +494,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
}
if (!exists) {
- ret = tcf_idr_create(tn, index, est, a,
- &act_tunnel_key_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_tunnel_key_ops, bind,
+ act_flags);
if (ret) {
NL_SET_ERR_MSG(extack, "Cannot create TC IDR");
goto release_tun_meta;
@@ -381,8 +529,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&t->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(t->params, params_new,
- lockdep_is_held(&t->tcf_lock));
+ params_new = rcu_replace_pointer(t->params, params_new,
+ lockdep_is_held(&t->tcf_lock));
spin_unlock_bh(&t->tcf_lock);
tunnel_key_release_params(params_new);
if (goto_ch)
@@ -450,6 +598,56 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb,
return 0;
}
+static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb,
+ const struct ip_tunnel_info *info)
+{
+ struct vxlan_metadata *md = (struct vxlan_metadata *)(info + 1);
+ struct nlattr *start;
+
+ start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_VXLAN);
+ if (!start)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, md->gbp)) {
+ nla_nest_cancel(skb, start);
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(skb, start);
+ return 0;
+}
+
+static int tunnel_key_erspan_opts_dump(struct sk_buff *skb,
+ const struct ip_tunnel_info *info)
+{
+ struct erspan_metadata *md = (struct erspan_metadata *)(info + 1);
+ struct nlattr *start;
+
+ start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN);
+ if (!start)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER, md->version))
+ goto err;
+
+ if (md->version == 1 &&
+ nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index))
+ goto err;
+
+ if (md->version == 2 &&
+ (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR,
+ md->u.md2.dir) ||
+ nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID,
+ get_hwid(&md->u.md2))))
+ goto err;
+
+ nla_nest_end(skb, start);
+ return 0;
+err:
+ nla_nest_cancel(skb, start);
+ return -EMSGSIZE;
+}
+
static int tunnel_key_opts_dump(struct sk_buff *skb,
const struct ip_tunnel_info *info)
{
@@ -467,6 +665,14 @@ static int tunnel_key_opts_dump(struct sk_buff *skb,
err = tunnel_key_geneve_opts_dump(skb, info);
if (err)
goto err_out;
+ } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
+ err = tunnel_key_vxlan_opts_dump(skb, info);
+ if (err)
+ goto err_out;
+ } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) {
+ err = tunnel_key_erspan_opts_dump(skb, info);
+ if (err)
+ goto err_out;
} else {
err_out:
nla_nest_cancel(skb, start);
@@ -600,7 +806,7 @@ static __net_init int tunnel_key_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
- return tc_action_net_init(tn, &act_tunnel_key_ops);
+ return tc_action_net_init(net, tn, &act_tunnel_key_ops);
}
static void __net_exit tunnel_key_exit_net(struct list_head *net_list)
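tunnel_key_copy_opts() above doubles as a sizing pass and a copy pass: with a NULL destination the per-type helpers only validate the attributes and report their length (note the "if (dst)" guards in the geneve, vxlan and erspan helpers), and a second call with the metadata buffer fills the options in. A hedged illustration of that calling convention; the wrapper below is hypothetical and only exists to show the two passes:

	/* Hypothetical wrapper, for illustration only. */
	static int example_set_opts(struct nlattr *nla,
				    struct ip_tunnel_info *info,
				    struct netlink_ext_ack *extack)
	{
		int opts_len;

		/* sizing pass: dst == NULL, nothing is written */
		opts_len = tunnel_key_copy_opts(nla, NULL, 0, extack);
		if (opts_len < 0)
			return opts_len;

		/* copy pass: same parser, now filling the tunnel metadata */
		return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
					    opts_len, extack);
	}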
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index a3c9eea1ee8a..c91d3958fcbb 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -29,7 +29,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a,
u16 tci;
tcf_lastuse_update(&v->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&v->common, skb);
/* Ensure 'data' points at mac_header prior calling vlan manipulating
* functions.
@@ -88,7 +88,7 @@ out:
return action;
drop:
- qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats));
+ tcf_action_inc_drop_qstats(&v->common);
return TC_ACT_SHOT;
}
@@ -102,7 +102,8 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
static int tcf_vlan_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, vlan_net_id);
struct nlattr *tb[TCA_VLAN_MAX + 1];
@@ -188,8 +189,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
action = parm->v_action;
if (!exists) {
- ret = tcf_idr_create(tn, index, est, a,
- &act_vlan_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_vlan_ops, bind, flags);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -220,7 +221,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&v->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
+ p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
spin_unlock_bh(&v->tcf_lock);
if (goto_ch)
@@ -301,6 +302,16 @@ static int tcf_vlan_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
+static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u32 packets,
+ u64 lastuse, bool hw)
+{
+ struct tcf_vlan *v = to_vlan(a);
+ struct tcf_t *tm = &v->tcf_tm;
+
+ tcf_action_update_stats(a, bytes, packets, false, hw);
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse);
+}
+
static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, vlan_net_id);
@@ -325,6 +336,7 @@ static struct tc_action_ops act_vlan_ops = {
.init = tcf_vlan_init,
.cleanup = tcf_vlan_cleanup,
.walk = tcf_vlan_walker,
+ .stats_update = tcf_vlan_stats_update,
.get_fill_size = tcf_vlan_get_fill_size,
.lookup = tcf_vlan_search,
.size = sizeof(struct tcf_vlan),
@@ -334,7 +346,7 @@ static __net_init int vlan_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, vlan_net_id);
- return tc_action_net_init(tn, &act_vlan_ops);
+ return tc_action_net_init(net, tn, &act_vlan_ops);
}
static void __net_exit vlan_exit_net(struct list_head *net_list)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index efd3cfb80a2a..c2cdd0fc2e70 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -21,6 +21,7 @@
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/rhashtable.h>
+#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
@@ -36,6 +37,8 @@
#include <net/tc_act/tc_sample.h>
#include <net/tc_act/tc_skbedit.h>
#include <net/tc_act/tc_ct.h>
+#include <net/tc_act/tc_mpls.h>
+#include <net/flow_offload.h>
extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
@@ -45,6 +48,62 @@ static LIST_HEAD(tcf_proto_base);
/* Protects list of registered TC modules. It is pure SMP lock. */
static DEFINE_RWLOCK(cls_mod_lock);
+static u32 destroy_obj_hashfn(const struct tcf_proto *tp)
+{
+ return jhash_3words(tp->chain->index, tp->prio,
+ (__force __u32)tp->protocol, 0);
+}
+
+static void tcf_proto_signal_destroying(struct tcf_chain *chain,
+ struct tcf_proto *tp)
+{
+ struct tcf_block *block = chain->block;
+
+ mutex_lock(&block->proto_destroy_lock);
+ hash_add_rcu(block->proto_destroy_ht, &tp->destroy_ht_node,
+ destroy_obj_hashfn(tp));
+ mutex_unlock(&block->proto_destroy_lock);
+}
+
+static bool tcf_proto_cmp(const struct tcf_proto *tp1,
+ const struct tcf_proto *tp2)
+{
+ return tp1->chain->index == tp2->chain->index &&
+ tp1->prio == tp2->prio &&
+ tp1->protocol == tp2->protocol;
+}
+
+static bool tcf_proto_exists_destroying(struct tcf_chain *chain,
+ struct tcf_proto *tp)
+{
+ u32 hash = destroy_obj_hashfn(tp);
+ struct tcf_proto *iter;
+ bool found = false;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu(chain->block->proto_destroy_ht, iter,
+ destroy_ht_node, hash) {
+ if (tcf_proto_cmp(tp, iter)) {
+ found = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return found;
+}
+
+static void
+tcf_proto_signal_destroyed(struct tcf_chain *chain, struct tcf_proto *tp)
+{
+ struct tcf_block *block = chain->block;
+
+ mutex_lock(&block->proto_destroy_lock);
+ if (hash_hashed(&tp->destroy_ht_node))
+ hash_del_rcu(&tp->destroy_ht_node);
+ mutex_unlock(&block->proto_destroy_lock);
+}
+
/* Find classifier type by string name */
static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind)
@@ -160,11 +219,22 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
return TC_H_MAJ(first);
}
+static bool tcf_proto_check_kind(struct nlattr *kind, char *name)
+{
+ if (kind)
+ return nla_strlcpy(name, kind, IFNAMSIZ) >= IFNAMSIZ;
+ memset(name, 0, IFNAMSIZ);
+ return false;
+}
+
static bool tcf_proto_is_unlocked(const char *kind)
{
const struct tcf_proto_ops *ops;
bool ret;
+ if (strlen(kind) == 0)
+ return false;
+
ops = tcf_proto_lookup_ops(kind, false, NULL);
/* On error return false to take rtnl lock. Proto lookup/create
* functions will perform lookup again and properly handle errors.
@@ -221,9 +291,11 @@ static void tcf_proto_get(struct tcf_proto *tp)
static void tcf_chain_put(struct tcf_chain *chain);
static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held,
- struct netlink_ext_ack *extack)
+ bool sig_destroy, struct netlink_ext_ack *extack)
{
tp->ops->destroy(tp, rtnl_held, extack);
+ if (sig_destroy)
+ tcf_proto_signal_destroyed(tp->chain, tp);
tcf_chain_put(tp->chain);
module_put(tp->ops->owner);
kfree_rcu(tp, rcu);
@@ -233,36 +305,15 @@ static void tcf_proto_put(struct tcf_proto *tp, bool rtnl_held,
struct netlink_ext_ack *extack)
{
if (refcount_dec_and_test(&tp->refcnt))
- tcf_proto_destroy(tp, rtnl_held, extack);
-}
-
-static int walker_check_empty(struct tcf_proto *tp, void *fh,
- struct tcf_walker *arg)
-{
- if (fh) {
- arg->nonempty = true;
- return -1;
- }
- return 0;
+ tcf_proto_destroy(tp, rtnl_held, true, extack);
}
-static bool tcf_proto_is_empty(struct tcf_proto *tp, bool rtnl_held)
+static bool tcf_proto_check_delete(struct tcf_proto *tp)
{
- struct tcf_walker walker = { .fn = walker_check_empty, };
+ if (tp->ops->delete_empty)
+ return tp->ops->delete_empty(tp);
- if (tp->ops->walk) {
- tp->ops->walk(tp, &walker, rtnl_held);
- return !walker.nonempty;
- }
- return true;
-}
-
-static bool tcf_proto_check_delete(struct tcf_proto *tp, bool rtnl_held)
-{
- spin_lock(&tp->lock);
- if (tcf_proto_is_empty(tp, rtnl_held))
- tp->deleting = true;
- spin_unlock(&tp->lock);
+ tp->deleting = true;
return tp->deleting;
}
@@ -357,6 +408,7 @@ static bool tcf_chain_detach(struct tcf_chain *chain)
static void tcf_block_destroy(struct tcf_block *block)
{
mutex_destroy(&block->lock);
+ mutex_destroy(&block->proto_destroy_lock);
kfree_rcu(block, rcu);
}
@@ -532,6 +584,12 @@ static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held)
mutex_lock(&chain->filter_chain_lock);
tp = tcf_chain_dereference(chain->filter_chain, chain);
+ while (tp) {
+ tp_next = rcu_dereference_protected(tp->next, 1);
+ tcf_proto_signal_destroying(chain, tp);
+ tp = tp_next;
+ }
+ tp = tcf_chain_dereference(chain->filter_chain, chain);
RCU_INIT_POINTER(chain->filter_chain, NULL);
tcf_chain0_head_change(chain, NULL);
chain->flushing = true;
@@ -544,235 +602,87 @@ static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held)
}
}
-static struct tcf_block *tc_dev_ingress_block(struct net_device *dev)
-{
- const struct Qdisc_class_ops *cops;
- struct Qdisc *qdisc;
-
- if (!dev_ingress_queue(dev))
- return NULL;
-
- qdisc = dev_ingress_queue(dev)->qdisc_sleeping;
- if (!qdisc)
- return NULL;
-
- cops = qdisc->ops->cl_ops;
- if (!cops)
- return NULL;
-
- if (!cops->tcf_block)
- return NULL;
-
- return cops->tcf_block(qdisc, TC_H_MIN_INGRESS, NULL);
-}
-
-static struct rhashtable indr_setup_block_ht;
-
-struct tc_indr_block_dev {
- struct rhash_head ht_node;
- struct net_device *dev;
- unsigned int refcnt;
- struct list_head cb_list;
- struct tcf_block *block;
-};
-
-struct tc_indr_block_cb {
- struct list_head list;
- void *cb_priv;
- tc_indr_block_bind_cb_t *cb;
- void *cb_ident;
-};
-
-static const struct rhashtable_params tc_indr_setup_block_ht_params = {
- .key_offset = offsetof(struct tc_indr_block_dev, dev),
- .head_offset = offsetof(struct tc_indr_block_dev, ht_node),
- .key_len = sizeof(struct net_device *),
-};
-
-static struct tc_indr_block_dev *
-tc_indr_block_dev_lookup(struct net_device *dev)
-{
- return rhashtable_lookup_fast(&indr_setup_block_ht, &dev,
- tc_indr_setup_block_ht_params);
-}
-
-static struct tc_indr_block_dev *tc_indr_block_dev_get(struct net_device *dev)
-{
- struct tc_indr_block_dev *indr_dev;
-
- indr_dev = tc_indr_block_dev_lookup(dev);
- if (indr_dev)
- goto inc_ref;
-
- indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL);
- if (!indr_dev)
- return NULL;
-
- INIT_LIST_HEAD(&indr_dev->cb_list);
- indr_dev->dev = dev;
- indr_dev->block = tc_dev_ingress_block(dev);
- if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node,
- tc_indr_setup_block_ht_params)) {
- kfree(indr_dev);
- return NULL;
- }
-
-inc_ref:
- indr_dev->refcnt++;
- return indr_dev;
-}
-
-static void tc_indr_block_dev_put(struct tc_indr_block_dev *indr_dev)
-{
- if (--indr_dev->refcnt)
- return;
-
- rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node,
- tc_indr_setup_block_ht_params);
- kfree(indr_dev);
-}
-
-static struct tc_indr_block_cb *
-tc_indr_block_cb_lookup(struct tc_indr_block_dev *indr_dev,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
- struct tc_indr_block_cb *indr_block_cb;
-
- list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
- if (indr_block_cb->cb == cb &&
- indr_block_cb->cb_ident == cb_ident)
- return indr_block_cb;
- return NULL;
-}
-
-static struct tc_indr_block_cb *
-tc_indr_block_cb_add(struct tc_indr_block_dev *indr_dev, void *cb_priv,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
- struct tc_indr_block_cb *indr_block_cb;
-
- indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident);
- if (indr_block_cb)
- return ERR_PTR(-EEXIST);
-
- indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL);
- if (!indr_block_cb)
- return ERR_PTR(-ENOMEM);
-
- indr_block_cb->cb_priv = cb_priv;
- indr_block_cb->cb = cb;
- indr_block_cb->cb_ident = cb_ident;
- list_add(&indr_block_cb->list, &indr_dev->cb_list);
-
- return indr_block_cb;
-}
-
-static void tc_indr_block_cb_del(struct tc_indr_block_cb *indr_block_cb)
-{
- list_del(&indr_block_cb->list);
- kfree(indr_block_cb);
-}
-
static int tcf_block_setup(struct tcf_block *block,
struct flow_block_offload *bo);
-static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
- struct tc_indr_block_cb *indr_block_cb,
- enum flow_block_command command)
+static void tc_indr_block_cmd(struct net_device *dev, struct tcf_block *block,
+ flow_indr_block_bind_cb_t *cb, void *cb_priv,
+ enum flow_block_command command, bool ingress)
{
struct flow_block_offload bo = {
.command = command,
- .binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
- .net = dev_net(indr_dev->dev),
- .block_shared = tcf_block_non_null_shared(indr_dev->block),
+ .binder_type = ingress ?
+ FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS :
+ FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS,
+ .net = dev_net(dev),
+ .block_shared = tcf_block_non_null_shared(block),
};
INIT_LIST_HEAD(&bo.cb_list);
- if (!indr_dev->block)
+ if (!block)
return;
- bo.block = &indr_dev->block->flow_block;
+ bo.block = &block->flow_block;
+
+ down_write(&block->cb_lock);
+ cb(dev, cb_priv, TC_SETUP_BLOCK, &bo);
- indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
- &bo);
- tcf_block_setup(indr_dev->block, &bo);
+ tcf_block_setup(block, &bo);
+ up_write(&block->cb_lock);
}
-int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
+static struct tcf_block *tc_dev_block(struct net_device *dev, bool ingress)
{
- struct tc_indr_block_cb *indr_block_cb;
- struct tc_indr_block_dev *indr_dev;
- int err;
+ const struct Qdisc_class_ops *cops;
+ const struct Qdisc_ops *ops;
+ struct Qdisc *qdisc;
- indr_dev = tc_indr_block_dev_get(dev);
- if (!indr_dev)
- return -ENOMEM;
+ if (!dev_ingress_queue(dev))
+ return NULL;
- indr_block_cb = tc_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident);
- err = PTR_ERR_OR_ZERO(indr_block_cb);
- if (err)
- goto err_dev_put;
+ qdisc = dev_ingress_queue(dev)->qdisc_sleeping;
+ if (!qdisc)
+ return NULL;
- tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_BIND);
- return 0;
+ ops = qdisc->ops;
+ if (!ops)
+ return NULL;
-err_dev_put:
- tc_indr_block_dev_put(indr_dev);
- return err;
-}
-EXPORT_SYMBOL_GPL(__tc_indr_block_cb_register);
+ if (!ingress && !strcmp("ingress", ops->id))
+ return NULL;
-int tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
- int err;
+ cops = ops->cl_ops;
+ if (!cops)
+ return NULL;
- rtnl_lock();
- err = __tc_indr_block_cb_register(dev, cb_priv, cb, cb_ident);
- rtnl_unlock();
+ if (!cops->tcf_block)
+ return NULL;
- return err;
+ return cops->tcf_block(qdisc,
+ ingress ? TC_H_MIN_INGRESS : TC_H_MIN_EGRESS,
+ NULL);
}
-EXPORT_SYMBOL_GPL(tc_indr_block_cb_register);
-void __tc_indr_block_cb_unregister(struct net_device *dev,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
+static void tc_indr_block_get_and_cmd(struct net_device *dev,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_priv,
+ enum flow_block_command command)
{
- struct tc_indr_block_cb *indr_block_cb;
- struct tc_indr_block_dev *indr_dev;
-
- indr_dev = tc_indr_block_dev_lookup(dev);
- if (!indr_dev)
- return;
+ struct tcf_block *block;
- indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident);
- if (!indr_block_cb)
- return;
+ block = tc_dev_block(dev, true);
+ tc_indr_block_cmd(dev, block, cb, cb_priv, command, true);
- /* Send unbind message if required to free any block cbs. */
- tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_UNBIND);
- tc_indr_block_cb_del(indr_block_cb);
- tc_indr_block_dev_put(indr_dev);
-}
-EXPORT_SYMBOL_GPL(__tc_indr_block_cb_unregister);
-
-void tc_indr_block_cb_unregister(struct net_device *dev,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
- rtnl_lock();
- __tc_indr_block_cb_unregister(dev, cb, cb_ident);
- rtnl_unlock();
+ block = tc_dev_block(dev, false);
+ tc_indr_block_cmd(dev, block, cb, cb_priv, command, false);
}
-EXPORT_SYMBOL_GPL(tc_indr_block_cb_unregister);
-static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
+static void tc_indr_block_call(struct tcf_block *block,
+ struct net_device *dev,
struct tcf_block_ext_info *ei,
enum flow_block_command command,
struct netlink_ext_ack *extack)
{
- struct tc_indr_block_cb *indr_block_cb;
- struct tc_indr_block_dev *indr_dev;
struct flow_block_offload bo = {
.command = command,
.binder_type = ei->binder_type,
@@ -783,22 +693,13 @@ static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
};
INIT_LIST_HEAD(&bo.cb_list);
- indr_dev = tc_indr_block_dev_lookup(dev);
- if (!indr_dev)
- return;
-
- indr_dev->block = command == FLOW_BLOCK_BIND ? block : NULL;
-
- list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
- indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
- &bo);
-
+ flow_indr_block_call(dev, &bo, command);
tcf_block_setup(block, &bo);
}
static bool tcf_block_offload_in_use(struct tcf_block *block)
{
- return block->offloadcnt;
+ return atomic_read(&block->offloadcnt);
}
static int tcf_block_offload_cmd(struct tcf_block *block,
@@ -832,6 +733,7 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
struct net_device *dev = q->dev_queue->dev;
int err;
+ down_write(&block->cb_lock);
if (!dev->netdev_ops->ndo_setup_tc)
goto no_offload_dev_inc;
@@ -840,24 +742,31 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
*/
if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) {
NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled");
- return -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
+ goto err_unlock;
}
err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_BIND, extack);
if (err == -EOPNOTSUPP)
goto no_offload_dev_inc;
if (err)
- return err;
+ goto err_unlock;
tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack);
+ up_write(&block->cb_lock);
return 0;
no_offload_dev_inc:
- if (tcf_block_offload_in_use(block))
- return -EOPNOTSUPP;
+ if (tcf_block_offload_in_use(block)) {
+ err = -EOPNOTSUPP;
+ goto err_unlock;
+ }
+ err = 0;
block->nooffloaddevcnt++;
tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack);
- return 0;
+err_unlock:
+ up_write(&block->cb_lock);
+ return err;
}
static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
@@ -866,6 +775,7 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
struct net_device *dev = q->dev_queue->dev;
int err;
+ down_write(&block->cb_lock);
tc_indr_block_call(block, dev, ei, FLOW_BLOCK_UNBIND, NULL);
if (!dev->netdev_ops->ndo_setup_tc)
@@ -873,10 +783,12 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_UNBIND, NULL);
if (err == -EOPNOTSUPP)
goto no_offload_dev_dec;
+ up_write(&block->cb_lock);
return;
no_offload_dev_dec:
WARN_ON(block->nooffloaddevcnt-- == 0);
+ up_write(&block->cb_lock);
}
static int
@@ -991,6 +903,8 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
return ERR_PTR(-ENOMEM);
}
mutex_init(&block->lock);
+ mutex_init(&block->proto_destroy_lock);
+ init_rwsem(&block->cb_lock);
flow_block_init(&block->flow_block);
INIT_LIST_HEAD(&block->chain_list);
INIT_LIST_HEAD(&block->owner_list);
@@ -1526,6 +1440,8 @@ tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb,
struct tcf_proto *tp, *tp_prev;
int err;
+ lockdep_assert_held(&block->cb_lock);
+
for (chain = __tcf_get_next_chain(block, NULL);
chain;
chain_prev = chain,
@@ -1564,6 +1480,8 @@ static int tcf_block_bind(struct tcf_block *block,
struct flow_block_cb *block_cb, *next;
int err, i = 0;
+ lockdep_assert_held(&block->cb_lock);
+
list_for_each_entry(block_cb, &bo->cb_list, list) {
err = tcf_block_playback_offloads(block, block_cb->cb,
block_cb->cb_priv, true,
@@ -1571,6 +1489,8 @@ static int tcf_block_bind(struct tcf_block *block,
bo->extack);
if (err)
goto err_unroll;
+ if (!bo->unlocked_driver_cb)
+ block->lockeddevcnt++;
i++;
}
@@ -1586,6 +1506,8 @@ err_unroll:
block_cb->cb_priv, false,
tcf_block_offload_in_use(block),
NULL);
+ if (!bo->unlocked_driver_cb)
+ block->lockeddevcnt--;
}
flow_block_cb_free(block_cb);
}
@@ -1598,6 +1520,8 @@ static void tcf_block_unbind(struct tcf_block *block,
{
struct flow_block_cb *block_cb, *next;
+ lockdep_assert_held(&block->cb_lock);
+
list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
tcf_block_playback_offloads(block, block_cb->cb,
block_cb->cb_priv, false,
@@ -1605,6 +1529,8 @@ static void tcf_block_unbind(struct tcf_block *block,
NULL);
list_del(&block_cb->list);
flow_block_cb_free(block_cb);
+ if (!bo->unlocked_driver_cb)
+ block->lockeddevcnt--;
}
}
@@ -1659,6 +1585,18 @@ reclassify:
goto reset;
} else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) {
first_tp = res->goto_tp;
+
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ {
+ struct tc_skb_ext *ext;
+
+ ext = skb_ext_add(skb, TC_SKB_EXT);
+ if (WARN_ON_ONCE(!ext))
+ return TC_ACT_SHOT;
+
+ ext->chain = err & TC_ACT_EXT_VAL_MASK;
+ }
+#endif
goto reset;
}
#endif
@@ -1743,6 +1681,12 @@ static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain,
mutex_lock(&chain->filter_chain_lock);
+ if (tcf_proto_exists_destroying(chain, tp_new)) {
+ mutex_unlock(&chain->filter_chain_lock);
+ tcf_proto_destroy(tp_new, rtnl_held, false, NULL);
+ return ERR_PTR(-EAGAIN);
+ }
+
tp = tcf_chain_tp_find(chain, &chain_info,
protocol, prio, false);
if (!tp)
@@ -1750,10 +1694,10 @@ static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain,
mutex_unlock(&chain->filter_chain_lock);
if (tp) {
- tcf_proto_destroy(tp_new, rtnl_held, NULL);
+ tcf_proto_destroy(tp_new, rtnl_held, false, NULL);
tp_new = tp;
} else if (err) {
- tcf_proto_destroy(tp_new, rtnl_held, NULL);
+ tcf_proto_destroy(tp_new, rtnl_held, false, NULL);
tp_new = ERR_PTR(err);
}
@@ -1786,11 +1730,12 @@ static void tcf_chain_tp_delete_empty(struct tcf_chain *chain,
* concurrently.
* Mark tp for deletion if it is empty.
*/
- if (!tp_iter || !tcf_proto_check_delete(tp, rtnl_held)) {
+ if (!tp_iter || !tcf_proto_check_delete(tp)) {
mutex_unlock(&chain->filter_chain_lock);
return;
}
+ tcf_proto_signal_destroying(chain, tp);
next = tcf_chain_dereference(chain_info.next, chain);
if (tp == chain->filter_chain)
tcf_chain0_head_change(chain, next);
@@ -1976,6 +1921,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
+ char name[IFNAMSIZ];
struct tcmsg *t;
u32 protocol;
u32 prio;
@@ -2032,13 +1978,19 @@ replay:
if (err)
return err;
+ if (tcf_proto_check_kind(tca[TCA_KIND], name)) {
+ NL_SET_ERR_MSG(extack, "Specified TC filter name too long");
+ err = -EINVAL;
+ goto errout;
+ }
+
/* Take rtnl mutex if rtnl_held was set to true on previous iteration,
* block is shared (no qdisc found), qdisc is not unlocked, classifier
* type is not specified, classifier is not unlocked.
*/
if (rtnl_held ||
(q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) ||
- !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) {
+ !tcf_proto_is_unlocked(name)) {
rtnl_held = true;
rtnl_lock();
}
@@ -2103,9 +2055,8 @@ replay:
&chain_info));
mutex_unlock(&chain->filter_chain_lock);
- tp_new = tcf_proto_create(nla_data(tca[TCA_KIND]),
- protocol, prio, chain, rtnl_held,
- extack);
+ tp_new = tcf_proto_create(name, protocol, prio, chain,
+ rtnl_held, extack);
if (IS_ERR(tp_new)) {
err = PTR_ERR(tp_new);
goto errout_tp;
@@ -2196,6 +2147,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
+ char name[IFNAMSIZ];
struct tcmsg *t;
u32 protocol;
u32 prio;
@@ -2235,13 +2187,18 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
if (err)
return err;
+ if (tcf_proto_check_kind(tca[TCA_KIND], name)) {
+ NL_SET_ERR_MSG(extack, "Specified TC filter name too long");
+ err = -EINVAL;
+ goto errout;
+ }
/* Take rtnl mutex if flushing whole chain, block is shared (no qdisc
* found), qdisc is not unlocked, classifier type is not specified,
* classifier is not unlocked.
*/
if (!prio ||
(q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) ||
- !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) {
+ !tcf_proto_is_unlocked(name)) {
rtnl_held = true;
rtnl_lock();
}
@@ -2297,6 +2254,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
err = -EINVAL;
goto errout_locked;
} else if (t->tcm_handle == 0) {
+ tcf_proto_signal_destroying(chain, tp);
tcf_chain_tp_remove(chain, &chain_info, tp);
mutex_unlock(&chain->filter_chain_lock);
@@ -2349,6 +2307,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
+ char name[IFNAMSIZ];
struct tcmsg *t;
u32 protocol;
u32 prio;
@@ -2385,12 +2344,17 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
if (err)
return err;
+ if (tcf_proto_check_kind(tca[TCA_KIND], name)) {
+ NL_SET_ERR_MSG(extack, "Specified TC filter name too long");
+ err = -EINVAL;
+ goto errout;
+ }
/* Take rtnl mutex if block is shared (no qdisc found), qdisc is not
* unlocked, classifier type is not specified, classifier is not
* unlocked.
*/
if ((q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) ||
- !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) {
+ !tcf_proto_is_unlocked(name)) {
rtnl_held = true;
rtnl_lock();
}
@@ -2749,13 +2713,19 @@ static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net,
struct netlink_ext_ack *extack)
{
const struct tcf_proto_ops *ops;
+ char name[IFNAMSIZ];
void *tmplt_priv;
/* If kind is not set, user did not specify template. */
if (!tca[TCA_KIND])
return 0;
- ops = tcf_proto_lookup_ops(nla_data(tca[TCA_KIND]), true, extack);
+ if (tcf_proto_check_kind(tca[TCA_KIND], name)) {
+ NL_SET_ERR_MSG(extack, "Specified TC chain template name too long");
+ return -EINVAL;
+ }
+
+ ops = tcf_proto_lookup_ops(name, true, extack);
if (IS_ERR(ops))
return PTR_ERR(ops);
if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) {
@@ -3027,8 +2997,10 @@ out:
void tcf_exts_destroy(struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
- tcf_action_destroy(exts->actions, TCA_ACT_UNBIND);
- kfree(exts->actions);
+ if (exts->actions) {
+ tcf_action_destroy(exts->actions, TCA_ACT_UNBIND);
+ kfree(exts->actions);
+ }
exts->nr_actions = 0;
#endif
}
@@ -3151,17 +3123,61 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
}
EXPORT_SYMBOL(tcf_exts_dump_stats);
-int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
- void *type_data, bool err_stop)
+static void tcf_block_offload_inc(struct tcf_block *block, u32 *flags)
+{
+ if (*flags & TCA_CLS_FLAGS_IN_HW)
+ return;
+ *flags |= TCA_CLS_FLAGS_IN_HW;
+ atomic_inc(&block->offloadcnt);
+}
+
+static void tcf_block_offload_dec(struct tcf_block *block, u32 *flags)
+{
+ if (!(*flags & TCA_CLS_FLAGS_IN_HW))
+ return;
+ *flags &= ~TCA_CLS_FLAGS_IN_HW;
+ atomic_dec(&block->offloadcnt);
+}
+
+static void tc_cls_offload_cnt_update(struct tcf_block *block,
+ struct tcf_proto *tp, u32 *cnt,
+ u32 *flags, u32 diff, bool add)
+{
+ lockdep_assert_held(&block->cb_lock);
+
+ spin_lock(&tp->lock);
+ if (add) {
+ if (!*cnt)
+ tcf_block_offload_inc(block, flags);
+ *cnt += diff;
+ } else {
+ *cnt -= diff;
+ if (!*cnt)
+ tcf_block_offload_dec(block, flags);
+ }
+ spin_unlock(&tp->lock);
+}
+
+static void
+tc_cls_offload_cnt_reset(struct tcf_block *block, struct tcf_proto *tp,
+ u32 *cnt, u32 *flags)
+{
+ lockdep_assert_held(&block->cb_lock);
+
+ spin_lock(&tp->lock);
+ tcf_block_offload_dec(block, flags);
+ *cnt = 0;
+ spin_unlock(&tp->lock);
+}
+
+static int
+__tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
+ void *type_data, bool err_stop)
{
struct flow_block_cb *block_cb;
int ok_count = 0;
int err;
- /* Make sure all netdevs sharing this block are offload-capable. */
- if (block->nooffloaddevcnt && err_stop)
- return -EOPNOTSUPP;
-
list_for_each_entry(block_cb, &block->flow_block.cb_list, list) {
err = block_cb->cb(type, type_data, block_cb->cb_priv);
if (err) {
@@ -3173,17 +3189,261 @@ int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
}
return ok_count;
}
+
+int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
+ void *type_data, bool err_stop, bool rtnl_held)
+{
+ bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
+ int ok_count;
+
+retry:
+ if (take_rtnl)
+ rtnl_lock();
+ down_read(&block->cb_lock);
+ /* Need to obtain rtnl lock if block is bound to devs that require it.
+ * In block bind code cb_lock is obtained while holding rtnl, so we must
+ * obtain the locks in same order here.
+ */
+ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+ up_read(&block->cb_lock);
+ take_rtnl = true;
+ goto retry;
+ }
+
+ ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+
+ up_read(&block->cb_lock);
+ if (take_rtnl)
+ rtnl_unlock();
+ return ok_count;
+}
EXPORT_SYMBOL(tc_setup_cb_call);
+/* Non-destructive filter add. If filter that wasn't already in hardware is
+ * successfully offloaded, increment block offloads counter. On failure,
+ * previously offloaded filter is considered to be intact and offloads counter
+ * is not decremented.
+ */
+
+int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp,
+ enum tc_setup_type type, void *type_data, bool err_stop,
+ u32 *flags, unsigned int *in_hw_count, bool rtnl_held)
+{
+ bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
+ int ok_count;
+
+retry:
+ if (take_rtnl)
+ rtnl_lock();
+ down_read(&block->cb_lock);
+ /* Need to obtain rtnl lock if block is bound to devs that require it.
+ * In block bind code cb_lock is obtained while holding rtnl, so we must
+ * obtain the locks in same order here.
+ */
+ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+ up_read(&block->cb_lock);
+ take_rtnl = true;
+ goto retry;
+ }
+
+ /* Make sure all netdevs sharing this block are offload-capable. */
+ if (block->nooffloaddevcnt && err_stop) {
+ ok_count = -EOPNOTSUPP;
+ goto err_unlock;
+ }
+
+ ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+ if (ok_count < 0)
+ goto err_unlock;
+
+ if (tp->ops->hw_add)
+ tp->ops->hw_add(tp, type_data);
+ if (ok_count > 0)
+ tc_cls_offload_cnt_update(block, tp, in_hw_count, flags,
+ ok_count, true);
+err_unlock:
+ up_read(&block->cb_lock);
+ if (take_rtnl)
+ rtnl_unlock();
+ return ok_count < 0 ? ok_count : 0;
+}
+EXPORT_SYMBOL(tc_setup_cb_add);
+
+/* Destructive filter replace. If filter that wasn't already in hardware is
+ * successfully offloaded, increment block offload counter. On failure,
+ * previously offloaded filter is considered to be destroyed and offload counter
+ * is decremented.
+ */
+
+int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp,
+ enum tc_setup_type type, void *type_data, bool err_stop,
+ u32 *old_flags, unsigned int *old_in_hw_count,
+ u32 *new_flags, unsigned int *new_in_hw_count,
+ bool rtnl_held)
+{
+ bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
+ int ok_count;
+
+retry:
+ if (take_rtnl)
+ rtnl_lock();
+ down_read(&block->cb_lock);
+ /* Need to obtain rtnl lock if block is bound to devs that require it.
+ * In block bind code cb_lock is obtained while holding rtnl, so we must
+ * obtain the locks in same order here.
+ */
+ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+ up_read(&block->cb_lock);
+ take_rtnl = true;
+ goto retry;
+ }
+
+ /* Make sure all netdevs sharing this block are offload-capable. */
+ if (block->nooffloaddevcnt && err_stop) {
+ ok_count = -EOPNOTSUPP;
+ goto err_unlock;
+ }
+
+ tc_cls_offload_cnt_reset(block, tp, old_in_hw_count, old_flags);
+ if (tp->ops->hw_del)
+ tp->ops->hw_del(tp, type_data);
+
+ ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+ if (ok_count < 0)
+ goto err_unlock;
+
+ if (tp->ops->hw_add)
+ tp->ops->hw_add(tp, type_data);
+ if (ok_count > 0)
+ tc_cls_offload_cnt_update(block, tp, new_in_hw_count,
+ new_flags, ok_count, true);
+err_unlock:
+ up_read(&block->cb_lock);
+ if (take_rtnl)
+ rtnl_unlock();
+ return ok_count < 0 ? ok_count : 0;
+}
+EXPORT_SYMBOL(tc_setup_cb_replace);
+
+/* Destroy filter and decrement block offload counter, if filter was previously
+ * offloaded.
+ */
+
+int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp,
+ enum tc_setup_type type, void *type_data, bool err_stop,
+ u32 *flags, unsigned int *in_hw_count, bool rtnl_held)
+{
+ bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
+ int ok_count;
+
+retry:
+ if (take_rtnl)
+ rtnl_lock();
+ down_read(&block->cb_lock);
+ /* Need to obtain rtnl lock if block is bound to devs that require it.
+ * In block bind code cb_lock is obtained while holding rtnl, so we must
+ * obtain the locks in same order here.
+ */
+ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+ up_read(&block->cb_lock);
+ take_rtnl = true;
+ goto retry;
+ }
+
+ ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+
+ tc_cls_offload_cnt_reset(block, tp, in_hw_count, flags);
+ if (tp->ops->hw_del)
+ tp->ops->hw_del(tp, type_data);
+
+ up_read(&block->cb_lock);
+ if (take_rtnl)
+ rtnl_unlock();
+ return ok_count < 0 ? ok_count : 0;
+}
+EXPORT_SYMBOL(tc_setup_cb_destroy);
+
+int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp,
+ bool add, flow_setup_cb_t *cb,
+ enum tc_setup_type type, void *type_data,
+ void *cb_priv, u32 *flags, unsigned int *in_hw_count)
+{
+ int err = cb(type, type_data, cb_priv);
+
+ if (err) {
+ if (add && tc_skip_sw(*flags))
+ return err;
+ } else {
+ tc_cls_offload_cnt_update(block, tp, in_hw_count, flags, 1,
+ add);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(tc_setup_cb_reoffload);
+
+void tc_cleanup_flow_action(struct flow_action *flow_action)
+{
+ struct flow_action_entry *entry;
+ int i;
+
+ flow_action_for_each(i, entry, flow_action)
+ if (entry->destructor)
+ entry->destructor(entry->destructor_priv);
+}
+EXPORT_SYMBOL(tc_cleanup_flow_action);
+
+static void tcf_mirred_get_dev(struct flow_action_entry *entry,
+ const struct tc_action *act)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ entry->dev = act->ops->get_dev(act, &entry->destructor);
+ if (!entry->dev)
+ return;
+ entry->destructor_priv = entry->dev;
+#endif
+}
+
+static void tcf_tunnel_encap_put_tunnel(void *priv)
+{
+ struct ip_tunnel_info *tunnel = priv;
+
+ kfree(tunnel);
+}
+
+static int tcf_tunnel_encap_get_tunnel(struct flow_action_entry *entry,
+ const struct tc_action *act)
+{
+ entry->tunnel = tcf_tunnel_info_copy(act);
+ if (!entry->tunnel)
+ return -ENOMEM;
+ entry->destructor = tcf_tunnel_encap_put_tunnel;
+ entry->destructor_priv = entry->tunnel;
+ return 0;
+}
+
+static void tcf_sample_get_group(struct flow_action_entry *entry,
+ const struct tc_action *act)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ entry->sample.psample_group =
+ act->ops->get_psample_group(act, &entry->destructor);
+ entry->destructor_priv = entry->sample.psample_group;
+#endif
+}
+
int tc_setup_flow_action(struct flow_action *flow_action,
- const struct tcf_exts *exts)
+ const struct tcf_exts *exts, bool rtnl_held)
{
const struct tc_action *act;
- int i, j, k;
+ int i, j, k, err = 0;
if (!exts)
return 0;
+ if (!rtnl_held)
+ rtnl_lock();
+
j = 0;
tcf_exts_for_each_action(i, act, exts) {
struct flow_action_entry *entry;
@@ -3200,10 +3460,16 @@ int tc_setup_flow_action(struct flow_action *flow_action,
entry->chain_index = tcf_gact_goto_chain_index(act);
} else if (is_tcf_mirred_egress_redirect(act)) {
entry->id = FLOW_ACTION_REDIRECT;
- entry->dev = tcf_mirred_dev(act);
+ tcf_mirred_get_dev(entry, act);
} else if (is_tcf_mirred_egress_mirror(act)) {
entry->id = FLOW_ACTION_MIRRED;
- entry->dev = tcf_mirred_dev(act);
+ tcf_mirred_get_dev(entry, act);
+ } else if (is_tcf_mirred_ingress_redirect(act)) {
+ entry->id = FLOW_ACTION_REDIRECT_INGRESS;
+ tcf_mirred_get_dev(entry, act);
+ } else if (is_tcf_mirred_ingress_mirror(act)) {
+ entry->id = FLOW_ACTION_MIRRED_INGRESS;
+ tcf_mirred_get_dev(entry, act);
} else if (is_tcf_vlan(act)) {
switch (tcf_vlan_action(act)) {
case TCA_VLAN_ACT_PUSH:
@@ -3222,11 +3488,14 @@ int tc_setup_flow_action(struct flow_action *flow_action,
entry->vlan.prio = tcf_vlan_push_prio(act);
break;
default:
+ err = -EOPNOTSUPP;
goto err_out;
}
} else if (is_tcf_tunnel_set(act)) {
entry->id = FLOW_ACTION_TUNNEL_ENCAP;
- entry->tunnel = tcf_tunnel_info(act);
+ err = tcf_tunnel_encap_get_tunnel(entry, act);
+ if (err)
+ goto err_out;
} else if (is_tcf_tunnel_release(act)) {
entry->id = FLOW_ACTION_TUNNEL_DECAP;
} else if (is_tcf_pedit(act)) {
@@ -3239,6 +3508,7 @@ int tc_setup_flow_action(struct flow_action *flow_action,
entry->id = FLOW_ACTION_ADD;
break;
default:
+ err = -EOPNOTSUPP;
goto err_out;
}
entry->mangle.htype = tcf_pedit_htype(act, k);
@@ -3255,11 +3525,10 @@ int tc_setup_flow_action(struct flow_action *flow_action,
entry->mark = tcf_skbedit_mark(act);
} else if (is_tcf_sample(act)) {
entry->id = FLOW_ACTION_SAMPLE;
- entry->sample.psample_group =
- tcf_sample_psample_group(act);
entry->sample.trunc_size = tcf_sample_trunc_size(act);
entry->sample.truncate = tcf_sample_truncate(act);
entry->sample.rate = tcf_sample_rate(act);
+ tcf_sample_get_group(entry, act);
} else if (is_tcf_police(act)) {
entry->id = FLOW_ACTION_POLICE;
entry->police.burst = tcf_police_tcfp_burst(act);
@@ -3269,16 +3538,50 @@ int tc_setup_flow_action(struct flow_action *flow_action,
entry->id = FLOW_ACTION_CT;
entry->ct.action = tcf_ct_action(act);
entry->ct.zone = tcf_ct_zone(act);
+ } else if (is_tcf_mpls(act)) {
+ switch (tcf_mpls_action(act)) {
+ case TCA_MPLS_ACT_PUSH:
+ entry->id = FLOW_ACTION_MPLS_PUSH;
+ entry->mpls_push.proto = tcf_mpls_proto(act);
+ entry->mpls_push.label = tcf_mpls_label(act);
+ entry->mpls_push.tc = tcf_mpls_tc(act);
+ entry->mpls_push.bos = tcf_mpls_bos(act);
+ entry->mpls_push.ttl = tcf_mpls_ttl(act);
+ break;
+ case TCA_MPLS_ACT_POP:
+ entry->id = FLOW_ACTION_MPLS_POP;
+ entry->mpls_pop.proto = tcf_mpls_proto(act);
+ break;
+ case TCA_MPLS_ACT_MODIFY:
+ entry->id = FLOW_ACTION_MPLS_MANGLE;
+ entry->mpls_mangle.label = tcf_mpls_label(act);
+ entry->mpls_mangle.tc = tcf_mpls_tc(act);
+ entry->mpls_mangle.bos = tcf_mpls_bos(act);
+ entry->mpls_mangle.ttl = tcf_mpls_ttl(act);
+ break;
+ default:
+ goto err_out;
+ }
+ } else if (is_tcf_skbedit_ptype(act)) {
+ entry->id = FLOW_ACTION_PTYPE;
+ entry->ptype = tcf_skbedit_ptype(act);
} else {
+ err = -EOPNOTSUPP;
goto err_out;
}
if (!is_tcf_pedit(act))
j++;
}
- return 0;
+
err_out:
- return -EOPNOTSUPP;
+ if (!rtnl_held)
+ rtnl_unlock();
+
+ if (err)
+ tc_cleanup_flow_action(flow_action);
+
+ return err;
}
EXPORT_SYMBOL(tc_setup_flow_action);
@@ -3321,6 +3624,11 @@ static struct pernet_operations tcf_net_ops = {
.size = sizeof(struct tcf_net),
};
+static struct flow_indr_block_entry block_entry = {
+ .cb = tc_indr_block_get_and_cmd,
+ .list = LIST_HEAD_INIT(block_entry.list),
+};
+
static int __init tc_filter_init(void)
{
int err;
@@ -3333,10 +3641,7 @@ static int __init tc_filter_init(void)
if (err)
goto err_register_pernet_subsys;
- err = rhashtable_init(&indr_setup_block_ht,
- &tc_indr_setup_block_ht_params);
- if (err)
- goto err_rhash_setup_block_ht;
+ flow_indr_add_block_cb(&block_entry);
rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL,
RTNL_FLAG_DOIT_UNLOCKED);
@@ -3351,8 +3656,6 @@ static int __init tc_filter_init(void)
return 0;
-err_rhash_setup_block_ht:
- unregister_pernet_subsys(&tcf_net_ops);
err_register_pernet_subsys:
destroy_workqueue(tc_filter_wq);
return err;
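The tc_setup_cb_call(), tc_setup_cb_add(), tc_setup_cb_replace() and tc_setup_cb_destroy() helpers above all repeat the same lock-ordering dance: cb_lock nests under rtnl because block bind acquires cb_lock while holding rtnl, so an unlocked caller that finds a locked driver bound to the block has to back off and retry with rtnl held. A condensed, hedged sketch of just that dance; do_offload() is a placeholder for the actual callback walk:

	/* Sketch only: the retry loop shared by the tc_setup_cb_*() helpers. */
	static int example_cb_call(struct tcf_block *block, bool rtnl_held)
	{
		bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
		int err;

	retry:
		if (take_rtnl)
			rtnl_lock();
		down_read(&block->cb_lock);
		/* rtnl must be taken before cb_lock; if a locked driver
		 * appeared after the first check, drop cb_lock and retry
		 * with rtnl held.
		 */
		if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
			up_read(&block->cb_lock);
			take_rtnl = true;
			goto retry;
		}

		err = do_offload(block);	/* placeholder */

		up_read(&block->cb_lock);
		if (take_rtnl)
			rtnl_unlock();
		return err;
	}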
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 4aafbe3d435c..f256a7c69093 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -263,12 +263,17 @@ skip:
}
}
-static void basic_bind_class(void *fh, u32 classid, unsigned long cl)
+static void basic_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct basic_filter *f = fh;
- if (f && f->res.classid == classid)
- f->res.class = cl;
+ if (f && f->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &f->res, base);
+ else
+ __tcf_unbind_filter(q, &f->res);
+ }
}
static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh,
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 3f7a9c02b70c..6e3e63db0e01 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -162,18 +162,24 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
cls_bpf.name = obj->bpf_name;
cls_bpf.exts_integrated = obj->exts_integrated;
- if (oldprog)
- tcf_block_offload_dec(block, &oldprog->gen_flags);
+ if (oldprog && prog)
+ err = tc_setup_cb_replace(block, tp, TC_SETUP_CLSBPF, &cls_bpf,
+ skip_sw, &oldprog->gen_flags,
+ &oldprog->in_hw_count,
+ &prog->gen_flags, &prog->in_hw_count,
+ true);
+ else if (prog)
+ err = tc_setup_cb_add(block, tp, TC_SETUP_CLSBPF, &cls_bpf,
+ skip_sw, &prog->gen_flags,
+ &prog->in_hw_count, true);
+ else
+ err = tc_setup_cb_destroy(block, tp, TC_SETUP_CLSBPF, &cls_bpf,
+ skip_sw, &oldprog->gen_flags,
+ &oldprog->in_hw_count, true);
- err = tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, skip_sw);
- if (prog) {
- if (err < 0) {
- cls_bpf_offload_cmd(tp, oldprog, prog, extack);
- return err;
- } else if (err > 0) {
- prog->in_hw_count = err;
- tcf_block_offload_inc(block, &prog->gen_flags);
- }
+ if (prog && err) {
+ cls_bpf_offload_cmd(tp, oldprog, prog, extack);
+ return err;
}
if (prog && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW))
@@ -230,7 +236,7 @@ static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
cls_bpf.name = prog->bpf_name;
cls_bpf.exts_integrated = prog->exts_integrated;
- tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, false, true);
}
static int cls_bpf_init(struct tcf_proto *tp)
@@ -625,12 +631,17 @@ nla_put_failure:
return -1;
}
-static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl)
+static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl,
+ void *q, unsigned long base)
{
struct cls_bpf_prog *prog = fh;
- if (prog && prog->res.classid == classid)
- prog->res.class = cl;
+ if (prog && prog->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &prog->res, base);
+ else
+ __tcf_unbind_filter(q, &prog->res);
+ }
}
static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg,
@@ -673,15 +684,11 @@ static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb
cls_bpf.name = prog->bpf_name;
cls_bpf.exts_integrated = prog->exts_integrated;
- err = cb(TC_SETUP_CLSBPF, &cls_bpf, cb_priv);
- if (err) {
- if (add && tc_skip_sw(prog->gen_flags))
- return err;
- continue;
- }
-
- tc_cls_offload_cnt_update(block, &prog->in_hw_count,
- &prog->gen_flags, add);
+ err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSBPF,
+ &cls_bpf, cb_priv, &prog->gen_flags,
+ &prog->in_hw_count);
+ if (err)
+ return err;
}
return 0;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 054123742e32..7e54d2ab5254 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -22,6 +22,8 @@
#include <net/ip.h>
#include <net/flow_dissector.h>
#include <net/geneve.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
@@ -54,8 +56,13 @@ struct fl_flow_key {
struct flow_dissector_key_ip ip;
struct flow_dissector_key_ip enc_ip;
struct flow_dissector_key_enc_opts enc_opts;
- struct flow_dissector_key_ports tp_min;
- struct flow_dissector_key_ports tp_max;
+ union {
+ struct flow_dissector_key_ports tp;
+ struct {
+ struct flow_dissector_key_ports tp_min;
+ struct flow_dissector_key_ports tp_max;
+ };
+ } tp_range;
struct flow_dissector_key_ct ct;
} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
@@ -198,19 +205,19 @@ static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter,
{
__be16 min_mask, max_mask, min_val, max_val;
- min_mask = htons(filter->mask->key.tp_min.dst);
- max_mask = htons(filter->mask->key.tp_max.dst);
- min_val = htons(filter->key.tp_min.dst);
- max_val = htons(filter->key.tp_max.dst);
+ min_mask = htons(filter->mask->key.tp_range.tp_min.dst);
+ max_mask = htons(filter->mask->key.tp_range.tp_max.dst);
+ min_val = htons(filter->key.tp_range.tp_min.dst);
+ max_val = htons(filter->key.tp_range.tp_max.dst);
if (min_mask && max_mask) {
- if (htons(key->tp.dst) < min_val ||
- htons(key->tp.dst) > max_val)
+ if (htons(key->tp_range.tp.dst) < min_val ||
+ htons(key->tp_range.tp.dst) > max_val)
return false;
/* skb does not have min and max values */
- mkey->tp_min.dst = filter->mkey.tp_min.dst;
- mkey->tp_max.dst = filter->mkey.tp_max.dst;
+ mkey->tp_range.tp_min.dst = filter->mkey.tp_range.tp_min.dst;
+ mkey->tp_range.tp_max.dst = filter->mkey.tp_range.tp_max.dst;
}
return true;
}
@@ -221,19 +228,19 @@ static bool fl_range_port_src_cmp(struct cls_fl_filter *filter,
{
__be16 min_mask, max_mask, min_val, max_val;
- min_mask = htons(filter->mask->key.tp_min.src);
- max_mask = htons(filter->mask->key.tp_max.src);
- min_val = htons(filter->key.tp_min.src);
- max_val = htons(filter->key.tp_max.src);
+ min_mask = htons(filter->mask->key.tp_range.tp_min.src);
+ max_mask = htons(filter->mask->key.tp_range.tp_max.src);
+ min_val = htons(filter->key.tp_range.tp_min.src);
+ max_val = htons(filter->key.tp_range.tp_max.src);
if (min_mask && max_mask) {
- if (htons(key->tp.src) < min_val ||
- htons(key->tp.src) > max_val)
+ if (htons(key->tp_range.tp.src) < min_val ||
+ htons(key->tp_range.tp.src) > max_val)
return false;
/* skb does not have min and max values */
- mkey->tp_min.src = filter->mkey.tp_min.src;
- mkey->tp_max.src = filter->mkey.tp_max.src;
+ mkey->tp_range.tp_min.src = filter->mkey.tp_range.tp_min.src;
+ mkey->tp_range.tp_max.src = filter->mkey.tp_range.tp_max.src;
}
return true;
}
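
Taken together, the two range helpers above only constrain a port when both the min and max masks are set, and then require the packet's port to lie inside [min, max]. A concrete reading with invented values:

	/* Illustration only: a filter with tp_range.tp_min.dst = 1000 and
	 * tp_range.tp_max.dst = 2000 (both masks set):
	 *
	 *   packet dst port 1500 -> 1000 <= 1500 <= 2000, the range check
	 *                           passes and matching continues;
	 *   packet dst port 2500 -> fl_range_port_dst_cmp() returns false
	 *                           and this filter is not matched.
	 */
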
@@ -412,41 +419,27 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
struct tcf_block *block = tp->chain->block;
struct flow_cls_offload cls_flower = {};
- if (!rtnl_held)
- rtnl_lock();
-
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
cls_flower.command = FLOW_CLS_DESTROY;
cls_flower.cookie = (unsigned long) f;
- tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
- spin_lock(&tp->lock);
- list_del_init(&f->hw_list);
- tcf_block_offload_dec(block, &f->flags);
- spin_unlock(&tp->lock);
+ tc_setup_cb_destroy(block, tp, TC_SETUP_CLSFLOWER, &cls_flower, false,
+ &f->flags, &f->in_hw_count, rtnl_held);
- if (!rtnl_held)
- rtnl_unlock();
}
static int fl_hw_replace_filter(struct tcf_proto *tp,
struct cls_fl_filter *f, bool rtnl_held,
struct netlink_ext_ack *extack)
{
- struct cls_fl_head *head = fl_head_dereference(tp);
struct tcf_block *block = tp->chain->block;
struct flow_cls_offload cls_flower = {};
bool skip_sw = tc_skip_sw(f->flags);
int err = 0;
- if (!rtnl_held)
- rtnl_lock();
-
cls_flower.rule = flow_rule_alloc(tcf_exts_num_actions(&f->exts));
- if (!cls_flower.rule) {
- err = -ENOMEM;
- goto errout;
- }
+ if (!cls_flower.rule)
+ return -ENOMEM;
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
cls_flower.command = FLOW_CLS_REPLACE;
@@ -456,43 +449,31 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
cls_flower.rule->match.key = &f->mkey;
cls_flower.classid = f->res.classid;
- err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts);
+ err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts,
+ rtnl_held);
if (err) {
kfree(cls_flower.rule);
- if (skip_sw)
+ if (skip_sw) {
NL_SET_ERR_MSG_MOD(extack, "Failed to setup flow action");
- else
- err = 0;
- goto errout;
+ return err;
+ }
+ return 0;
}
- err = tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw);
+ err = tc_setup_cb_add(block, tp, TC_SETUP_CLSFLOWER, &cls_flower,
+ skip_sw, &f->flags, &f->in_hw_count, rtnl_held);
+ tc_cleanup_flow_action(&cls_flower.rule->action);
kfree(cls_flower.rule);
- if (err < 0) {
- fl_hw_destroy_filter(tp, f, true, NULL);
- goto errout;
- } else if (err > 0) {
- f->in_hw_count = err;
- err = 0;
- spin_lock(&tp->lock);
- tcf_block_offload_inc(block, &f->flags);
- spin_unlock(&tp->lock);
- }
-
- if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW)) {
- err = -EINVAL;
- goto errout;
+ if (err) {
+ fl_hw_destroy_filter(tp, f, rtnl_held, NULL);
+ return err;
}
- spin_lock(&tp->lock);
- list_add(&f->hw_list, &head->hw_filters);
- spin_unlock(&tp->lock);
-errout:
- if (!rtnl_held)
- rtnl_unlock();
+ if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW))
+ return -EINVAL;
- return err;
+ return 0;
}
static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
@@ -501,22 +482,17 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
struct tcf_block *block = tp->chain->block;
struct flow_cls_offload cls_flower = {};
- if (!rtnl_held)
- rtnl_lock();
-
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL);
cls_flower.command = FLOW_CLS_STATS;
cls_flower.cookie = (unsigned long) f;
cls_flower.classid = f->res.classid;
- tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false,
+ rtnl_held);
tcf_exts_stats_update(&f->exts, cls_flower.stats.bytes,
cls_flower.stats.pkts,
cls_flower.stats.lastused);
-
- if (!rtnl_held)
- rtnl_unlock();
}
static void __fl_put(struct cls_fl_filter *f)
@@ -715,11 +691,16 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
.len = 128 / BITS_PER_BYTE },
[TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_BINARY,
.len = 128 / BITS_PER_BYTE },
+ [TCA_FLOWER_FLAGS] = { .type = NLA_U32 },
};
static const struct nla_policy
enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPTS_UNSPEC] = {
+ .strict_start_type = TCA_FLOWER_KEY_ENC_OPTS_VXLAN },
[TCA_FLOWER_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED },
};
static const struct nla_policy
@@ -730,6 +711,19 @@ geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = {
.len = 128 },
};
+static const struct nla_policy
+vxlan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 },
+};
+
static void fl_set_key_val(struct nlattr **tb,
void *val, int val_type,
void *mask, int mask_type, int len)
@@ -746,23 +740,25 @@ static void fl_set_key_val(struct nlattr **tb,
static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key,
struct fl_flow_key *mask)
{
- fl_set_key_val(tb, &key->tp_min.dst,
- TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_min.dst,
- TCA_FLOWER_UNSPEC, sizeof(key->tp_min.dst));
- fl_set_key_val(tb, &key->tp_max.dst,
- TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_max.dst,
- TCA_FLOWER_UNSPEC, sizeof(key->tp_max.dst));
- fl_set_key_val(tb, &key->tp_min.src,
- TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_min.src,
- TCA_FLOWER_UNSPEC, sizeof(key->tp_min.src));
- fl_set_key_val(tb, &key->tp_max.src,
- TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_max.src,
- TCA_FLOWER_UNSPEC, sizeof(key->tp_max.src));
-
- if ((mask->tp_min.dst && mask->tp_max.dst &&
- htons(key->tp_max.dst) <= htons(key->tp_min.dst)) ||
- (mask->tp_min.src && mask->tp_max.src &&
- htons(key->tp_max.src) <= htons(key->tp_min.src)))
+ fl_set_key_val(tb, &key->tp_range.tp_min.dst,
+ TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_range.tp_min.dst,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_min.dst));
+ fl_set_key_val(tb, &key->tp_range.tp_max.dst,
+ TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_range.tp_max.dst,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.dst));
+ fl_set_key_val(tb, &key->tp_range.tp_min.src,
+ TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_range.tp_min.src,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_min.src));
+ fl_set_key_val(tb, &key->tp_range.tp_max.src,
+ TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_range.tp_max.src,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.src));
+
+ if ((mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst &&
+ htons(key->tp_range.tp_max.dst) <=
+ htons(key->tp_range.tp_min.dst)) ||
+ (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src &&
+ htons(key->tp_range.tp_max.src) <=
+ htons(key->tp_range.tp_min.src)))
return -EINVAL;
return 0;
@@ -959,6 +955,105 @@ static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key,
return sizeof(struct geneve_opt) + data_len;
}
+static int fl_set_vxlan_opt(const struct nlattr *nla, struct fl_flow_key *key,
+ int depth, int option_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1];
+ struct vxlan_metadata *md;
+ int err;
+
+ md = (struct vxlan_metadata *)&key->enc_opts.data[key->enc_opts.len];
+ memset(md, 0xff, sizeof(*md));
+
+ if (!depth)
+ return sizeof(*md);
+
+ if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_VXLAN) {
+ NL_SET_ERR_MSG(extack, "Non-vxlan option type for mask");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX, nla,
+ vxlan_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp");
+ return -EINVAL;
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP])
+ md->gbp = nla_get_u32(tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]);
+
+ return sizeof(*md);
+}
+
+static int fl_set_erspan_opt(const struct nlattr *nla, struct fl_flow_key *key,
+ int depth, int option_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1];
+ struct erspan_metadata *md;
+ int err;
+
+ md = (struct erspan_metadata *)&key->enc_opts.data[key->enc_opts.len];
+ memset(md, 0xff, sizeof(*md));
+ md->version = 1;
+
+ if (!depth)
+ return sizeof(*md);
+
+ if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_ERSPAN) {
+ NL_SET_ERR_MSG(extack, "Non-erspan option type for mask");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX, nla,
+ erspan_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver");
+ return -EINVAL;
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER])
+ md->version = nla_get_u8(tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]);
+
+ if (md->version == 1) {
+ if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index");
+ return -EINVAL;
+ }
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) {
+ nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX];
+ md->u.index = nla_get_be32(nla);
+ }
+ } else if (md->version == 2) {
+ if (!option_len && (!tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR] ||
+ !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID])) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid");
+ return -EINVAL;
+ }
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR]) {
+ nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR];
+ md->u.md2.dir = nla_get_u8(nla);
+ }
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID]) {
+ nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID];
+ set_hwid(&md->u.md2, nla_get_u8(nla));
+ }
+ } else {
+ NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect");
+ return -EINVAL;
+ }
+
+ return sizeof(*md);
+}
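
For reference, the attribute combinations accepted by the parser above, as read directly from the code (not additional documentation):

	/* ver is mandatory on the key pass (option_len == 0), and so are:
	 *   ver == 1: TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX           -> md->u.index
	 *   ver == 2: ..._ERSPAN_DIR and ..._ERSPAN_HWID             -> md->u.md2
	 * Any other version is rejected with -EINVAL.
	 */
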
+
static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
struct fl_flow_key *mask,
struct netlink_ext_ack *extack)
@@ -989,6 +1084,11 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS]), key_depth) {
switch (nla_type(nla_opt_key)) {
case TCA_FLOWER_KEY_ENC_OPTS_GENEVE:
+ if (key->enc_opts.dst_opt_type &&
+ key->enc_opts.dst_opt_type != TUNNEL_GENEVE_OPT) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for geneve options");
+ return -EINVAL;
+ }
option_len = 0;
key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT;
option_len = fl_set_geneve_opt(nla_opt_key, key,
@@ -1017,6 +1117,72 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
if (msk_depth)
nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
break;
+ case TCA_FLOWER_KEY_ENC_OPTS_VXLAN:
+ if (key->enc_opts.dst_opt_type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options");
+ return -EINVAL;
+ }
+ option_len = 0;
+ key->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT;
+ option_len = fl_set_vxlan_opt(nla_opt_key, key,
+ key_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ key->enc_opts.len += option_len;
+ /* At the same time we need to parse through the mask
+ * in order to verify exact and mask attribute lengths.
+ */
+ mask->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT;
+ option_len = fl_set_vxlan_opt(nla_opt_msk, mask,
+ msk_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ mask->enc_opts.len += option_len;
+ if (key->enc_opts.len != mask->enc_opts.len) {
+ NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
+ return -EINVAL;
+ }
+
+ if (msk_depth)
+ nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
+ break;
+ case TCA_FLOWER_KEY_ENC_OPTS_ERSPAN:
+ if (key->enc_opts.dst_opt_type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for erspan options");
+ return -EINVAL;
+ }
+ option_len = 0;
+ key->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT;
+ option_len = fl_set_erspan_opt(nla_opt_key, key,
+ key_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ key->enc_opts.len += option_len;
+ /* At the same time we need to parse through the mask
+ * in order to verify exact and mask attribute lengths.
+ */
+ mask->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT;
+ option_len = fl_set_erspan_opt(nla_opt_msk, mask,
+ msk_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ mask->enc_opts.len += option_len;
+ if (key->enc_opts.len != mask->enc_opts.len) {
+ NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
+ return -EINVAL;
+ }
+
+ if (msk_depth)
+ nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
+ break;
default:
NL_SET_ERR_MSG(extack, "Unknown tunnel option type");
return -EINVAL;
@@ -1316,7 +1482,7 @@ static int fl_init_mask_hashtable(struct fl_flow_mask *mask)
}
#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member)
-#define FL_KEY_MEMBER_SIZE(member) FIELD_SIZEOF(struct fl_flow_key, member)
+#define FL_KEY_MEMBER_SIZE(member) sizeof_field(struct fl_flow_key, member)
#define FL_KEY_IS_MASKED(mask, member) \
memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \
@@ -1351,9 +1517,10 @@ static void fl_init_dissector(struct flow_dissector *dissector,
FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
- if (FL_KEY_IS_MASKED(mask, tp) ||
- FL_KEY_IS_MASKED(mask, tp_min) || FL_KEY_IS_MASKED(mask, tp_max))
- FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_PORTS, tp);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE, tp_range);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_IP, ip);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
@@ -1402,8 +1569,10 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
fl_mask_copy(newmask, mask);
- if ((newmask->key.tp_min.dst && newmask->key.tp_max.dst) ||
- (newmask->key.tp_min.src && newmask->key.tp_max.src))
+ if ((newmask->key.tp_range.tp_min.dst &&
+ newmask->key.tp_range.tp_max.dst) ||
+ (newmask->key.tp_range.tp_min.src &&
+ newmask->key.tp_range.tp_max.src))
newmask->flags |= TCA_FLOWER_MASK_FLAGS_RANGE;
err = fl_init_mask_hashtable(newmask);
@@ -1831,7 +2000,8 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
cls_flower.rule->match.mask = &f->mask->key;
cls_flower.rule->match.key = &f->mkey;
- err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts);
+ err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts,
+ true);
if (err) {
kfree(cls_flower.rule);
if (tc_skip_sw(f->flags)) {
@@ -1844,21 +2014,17 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
cls_flower.classid = f->res.classid;
- err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv);
+ err = tc_setup_cb_reoffload(block, tp, add, cb,
+ TC_SETUP_CLSFLOWER, &cls_flower,
+ cb_priv, &f->flags,
+ &f->in_hw_count);
+ tc_cleanup_flow_action(&cls_flower.rule->action);
kfree(cls_flower.rule);
if (err) {
- if (add && tc_skip_sw(f->flags)) {
- __fl_put(f);
- return err;
- }
- goto next_flow;
+ __fl_put(f);
+ return err;
}
-
- spin_lock(&tp->lock);
- tc_cls_offload_cnt_update(block, &f->in_hw_count, &f->flags,
- add);
- spin_unlock(&tp->lock);
next_flow:
__fl_put(f);
}
@@ -1866,6 +2032,30 @@ next_flow:
return 0;
}
+static void fl_hw_add(struct tcf_proto *tp, void *type_data)
+{
+ struct flow_cls_offload *cls_flower = type_data;
+ struct cls_fl_filter *f =
+ (struct cls_fl_filter *) cls_flower->cookie;
+ struct cls_fl_head *head = fl_head_dereference(tp);
+
+ spin_lock(&tp->lock);
+ list_add(&f->hw_list, &head->hw_filters);
+ spin_unlock(&tp->lock);
+}
+
+static void fl_hw_del(struct tcf_proto *tp, void *type_data)
+{
+ struct flow_cls_offload *cls_flower = type_data;
+ struct cls_fl_filter *f =
+ (struct cls_fl_filter *) cls_flower->cookie;
+
+ spin_lock(&tp->lock);
+ if (!list_empty(&f->hw_list))
+ list_del_init(&f->hw_list);
+ spin_unlock(&tp->lock);
+}
+
static int fl_hw_create_tmplt(struct tcf_chain *chain,
struct fl_flow_tmplt *tmplt)
{
@@ -1886,7 +2076,7 @@ static int fl_hw_create_tmplt(struct tcf_chain *chain,
/* We don't care if driver (any of them) fails to handle this
* call. It serves just as a hint for it.
*/
- tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false, true);
kfree(cls_flower.rule);
return 0;
@@ -1902,7 +2092,7 @@ static void fl_hw_destroy_tmplt(struct tcf_chain *chain,
cls_flower.command = FLOW_CLS_TMPLT_DESTROY;
cls_flower.cookie = (unsigned long) tmplt;
- tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false, true);
}
static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain,
@@ -1980,18 +2170,22 @@ static int fl_dump_key_val(struct sk_buff *skb,
static int fl_dump_key_port_range(struct sk_buff *skb, struct fl_flow_key *key,
struct fl_flow_key *mask)
{
- if (fl_dump_key_val(skb, &key->tp_min.dst, TCA_FLOWER_KEY_PORT_DST_MIN,
- &mask->tp_min.dst, TCA_FLOWER_UNSPEC,
- sizeof(key->tp_min.dst)) ||
- fl_dump_key_val(skb, &key->tp_max.dst, TCA_FLOWER_KEY_PORT_DST_MAX,
- &mask->tp_max.dst, TCA_FLOWER_UNSPEC,
- sizeof(key->tp_max.dst)) ||
- fl_dump_key_val(skb, &key->tp_min.src, TCA_FLOWER_KEY_PORT_SRC_MIN,
- &mask->tp_min.src, TCA_FLOWER_UNSPEC,
- sizeof(key->tp_min.src)) ||
- fl_dump_key_val(skb, &key->tp_max.src, TCA_FLOWER_KEY_PORT_SRC_MAX,
- &mask->tp_max.src, TCA_FLOWER_UNSPEC,
- sizeof(key->tp_max.src)))
+ if (fl_dump_key_val(skb, &key->tp_range.tp_min.dst,
+ TCA_FLOWER_KEY_PORT_DST_MIN,
+ &mask->tp_range.tp_min.dst, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_range.tp_min.dst)) ||
+ fl_dump_key_val(skb, &key->tp_range.tp_max.dst,
+ TCA_FLOWER_KEY_PORT_DST_MAX,
+ &mask->tp_range.tp_max.dst, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_range.tp_max.dst)) ||
+ fl_dump_key_val(skb, &key->tp_range.tp_min.src,
+ TCA_FLOWER_KEY_PORT_SRC_MIN,
+ &mask->tp_range.tp_min.src, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_range.tp_min.src)) ||
+ fl_dump_key_val(skb, &key->tp_range.tp_max.src,
+ TCA_FLOWER_KEY_PORT_SRC_MAX,
+ &mask->tp_range.tp_max.src, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_range.tp_max.src)))
return -1;
return 0;
@@ -2145,6 +2339,61 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int fl_dump_key_vxlan_opt(struct sk_buff *skb,
+ struct flow_dissector_key_enc_opts *enc_opts)
+{
+ struct vxlan_metadata *md;
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_VXLAN);
+ if (!nest)
+ goto nla_put_failure;
+
+ md = (struct vxlan_metadata *)&enc_opts->data[0];
+ if (nla_put_u32(skb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, md->gbp))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int fl_dump_key_erspan_opt(struct sk_buff *skb,
+ struct flow_dissector_key_enc_opts *enc_opts)
+{
+ struct erspan_metadata *md;
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_ERSPAN);
+ if (!nest)
+ goto nla_put_failure;
+
+ md = (struct erspan_metadata *)&enc_opts->data[0];
+ if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER, md->version))
+ goto nla_put_failure;
+
+ if (md->version == 1 &&
+ nla_put_be32(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index))
+ goto nla_put_failure;
+
+ if (md->version == 2 &&
+ (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR,
+ md->u.md2.dir) ||
+ nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID,
+ get_hwid(&md->u.md2))))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
static int fl_dump_key_ct(struct sk_buff *skb,
struct flow_dissector_key_ct *key,
struct flow_dissector_key_ct *mask)
@@ -2198,6 +2447,16 @@ static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
if (err)
goto nla_put_failure;
break;
+ case TUNNEL_VXLAN_OPT:
+ err = fl_dump_key_vxlan_opt(skb, enc_opts);
+ if (err)
+ goto nla_put_failure;
+ break;
+ case TUNNEL_ERSPAN_OPT:
+ err = fl_dump_key_erspan_opt(skb, enc_opts);
+ if (err)
+ goto nla_put_failure;
+ break;
default:
goto nla_put_failure;
}
@@ -2507,12 +2766,28 @@ nla_put_failure:
return -EMSGSIZE;
}
-static void fl_bind_class(void *fh, u32 classid, unsigned long cl)
+static void fl_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct cls_fl_filter *f = fh;
- if (f && f->res.classid == classid)
- f->res.class = cl;
+ if (f && f->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &f->res, base);
+ else
+ __tcf_unbind_filter(q, &f->res);
+ }
+}
+
+static bool fl_delete_empty(struct tcf_proto *tp)
+{
+ struct cls_fl_head *head = fl_head_dereference(tp);
+
+ spin_lock(&tp->lock);
+ tp->deleting = idr_is_empty(&head->handle_idr);
+ spin_unlock(&tp->lock);
+
+ return tp->deleting;
}
static struct tcf_proto_ops cls_fl_ops __read_mostly = {
@@ -2524,8 +2799,11 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = {
.put = fl_put,
.change = fl_change,
.delete = fl_delete,
+ .delete_empty = fl_delete_empty,
.walk = fl_walk,
.reoffload = fl_reoffload,
+ .hw_add = fl_hw_add,
+ .hw_del = fl_hw_del,
.dump = fl_dump,
.bind_class = fl_bind_class,
.tmplt_create = fl_tmplt_create,
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index c9496c920d6f..ec945294626a 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -419,12 +419,17 @@ nla_put_failure:
return -1;
}
-static void fw_bind_class(void *fh, u32 classid, unsigned long cl)
+static void fw_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct fw_filter *f = fh;
- if (f && f->res.classid == classid)
- f->res.class = cl;
+ if (f && f->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &f->res, base);
+ else
+ __tcf_unbind_filter(q, &f->res);
+ }
}
static struct tcf_proto_ops cls_fw_ops __read_mostly = {
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 455ea2793f9b..610a0b728161 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -75,8 +75,8 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp,
cls_mall.command = TC_CLSMATCHALL_DESTROY;
cls_mall.cookie = cookie;
- tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false);
- tcf_block_offload_dec(block, &head->flags);
+ tc_setup_cb_destroy(block, tp, TC_SETUP_CLSMATCHALL, &cls_mall, false,
+ &head->flags, &head->in_hw_count, true);
}
static int mall_replace_hw_filter(struct tcf_proto *tp,
@@ -97,7 +97,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
cls_mall.command = TC_CLSMATCHALL_REPLACE;
cls_mall.cookie = cookie;
- err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts);
+ err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts, true);
if (err) {
kfree(cls_mall.rule);
mall_destroy_hw_filter(tp, head, cookie, NULL);
@@ -109,15 +109,14 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
return err;
}
- err = tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, skip_sw);
+ err = tc_setup_cb_add(block, tp, TC_SETUP_CLSMATCHALL, &cls_mall,
+ skip_sw, &head->flags, &head->in_hw_count, true);
+ tc_cleanup_flow_action(&cls_mall.rule->action);
kfree(cls_mall.rule);
- if (err < 0) {
+ if (err) {
mall_destroy_hw_filter(tp, head, cookie, NULL);
return err;
- } else if (err > 0) {
- head->in_hw_count = err;
- tcf_block_offload_inc(block, &head->flags);
}
if (skip_sw && !(head->flags & TCA_CLS_FLAGS_IN_HW))
@@ -158,6 +157,7 @@ static void *mall_get(struct tcf_proto *tp, u32 handle)
static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
[TCA_MATCHALL_UNSPEC] = { .type = NLA_UNSPEC },
[TCA_MATCHALL_CLASSID] = { .type = NLA_U32 },
+ [TCA_MATCHALL_FLAGS] = { .type = NLA_U32 },
};
static int mall_set_parms(struct net *net, struct tcf_proto *tp,
@@ -302,7 +302,7 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY;
cls_mall.cookie = (unsigned long)head;
- err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts);
+ err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts, true);
if (err) {
kfree(cls_mall.rule);
if (add && tc_skip_sw(head->flags)) {
@@ -312,16 +312,14 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
return 0;
}
- err = cb(TC_SETUP_CLSMATCHALL, &cls_mall, cb_priv);
+ err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSMATCHALL,
+ &cls_mall, cb_priv, &head->flags,
+ &head->in_hw_count);
+ tc_cleanup_flow_action(&cls_mall.rule->action);
kfree(cls_mall.rule);
- if (err) {
- if (add && tc_skip_sw(head->flags))
- return err;
- return 0;
- }
-
- tc_cls_offload_cnt_update(block, &head->in_hw_count, &head->flags, add);
+ if (err)
+ return err;
return 0;
}
@@ -337,7 +335,7 @@ static void mall_stats_hw_filter(struct tcf_proto *tp,
cls_mall.command = TC_CLSMATCHALL_STATS;
cls_mall.cookie = cookie;
- tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false, true);
tcf_exts_stats_update(&head->exts, cls_mall.stats.bytes,
cls_mall.stats.pkts, cls_mall.stats.lastused);
@@ -396,12 +394,17 @@ nla_put_failure:
return -1;
}
-static void mall_bind_class(void *fh, u32 classid, unsigned long cl)
+static void mall_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct cls_mall_head *head = fh;
- if (head && head->res.classid == classid)
- head->res.class = cl;
+ if (head && head->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &head->res, base);
+ else
+ __tcf_unbind_filter(q, &head->res);
+ }
}
static struct tcf_proto_ops cls_mall_ops __read_mostly = {
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 2d9e0b4484ea..6f8786b06bde 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -641,12 +641,17 @@ nla_put_failure:
return -1;
}
-static void route4_bind_class(void *fh, u32 classid, unsigned long cl)
+static void route4_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct route4_filter *f = fh;
- if (f && f->res.classid == classid)
- f->res.class = cl;
+ if (f && f->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &f->res, base);
+ else
+ __tcf_unbind_filter(q, &f->res);
+ }
}
static struct tcf_proto_ops cls_route4_ops __read_mostly = {
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 2f3c03b25d5d..d36949d9382c 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -463,10 +463,8 @@ static u32 gen_tunnel(struct rsvp_head *data)
static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = {
[TCA_RSVP_CLASSID] = { .type = NLA_U32 },
- [TCA_RSVP_DST] = { .type = NLA_BINARY,
- .len = RSVP_DST_LEN * sizeof(u32) },
- [TCA_RSVP_SRC] = { .type = NLA_BINARY,
- .len = RSVP_DST_LEN * sizeof(u32) },
+ [TCA_RSVP_DST] = { .len = RSVP_DST_LEN * sizeof(u32) },
+ [TCA_RSVP_SRC] = { .len = RSVP_DST_LEN * sizeof(u32) },
[TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) },
};
@@ -738,12 +736,17 @@ nla_put_failure:
return -1;
}
-static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl)
+static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct rsvp_filter *f = fh;
- if (f && f->res.classid == classid)
- f->res.class = cl;
+ if (f && f->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &f->res, base);
+ else
+ __tcf_unbind_filter(q, &f->res);
+ }
}
static struct tcf_proto_ops RSVP_OPS __read_mostly = {
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index e573e5a5c794..09b7dc5fe7e0 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -333,12 +333,31 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
cp->fall_through = p->fall_through;
cp->tp = tp;
+ if (tb[TCA_TCINDEX_HASH])
+ cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
+
+ if (tb[TCA_TCINDEX_MASK])
+ cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
+
+ if (tb[TCA_TCINDEX_SHIFT])
+ cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
+
+ if (!cp->hash) {
+ /* Hash not specified, use perfect hash if the upper limit
+ * of the hashing index is below the threshold.
+ */
+ if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
+ cp->hash = (cp->mask >> cp->shift) + 1;
+ else
+ cp->hash = DEFAULT_HASH_SIZE;
+ }
+
if (p->perfect) {
int i;
if (tcindex_alloc_perfect_hash(net, cp) < 0)
goto errout;
- for (i = 0; i < cp->hash; i++)
+ for (i = 0; i < min(cp->hash, p->hash); i++)
cp->perfect[i].res = p->perfect[i].res;
balloc = 1;
}
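
The hunk above pulls the hash/mask/shift parsing and the hash sizing in front of the perfect-hash copy, so the copy can be bounded by min(cp->hash, p->hash). A worked example of the sizing rule (illustration only; the figures assume the usual in-tree constants PERFECT_HASH_THRESHOLD == 64 and DEFAULT_HASH_SIZE == 64):

	/* mask = 0x00f0, shift = 4:  (mask >> shift) == 15 < 64,
	 *                            so cp->hash = 15 + 1 = 16 (perfect hash);
	 * mask = 0xffff, shift = 0:  (mask >> shift) == 65535 >= 64,
	 *                            so cp->hash = DEFAULT_HASH_SIZE.
	 */
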
@@ -346,19 +365,10 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
err = tcindex_filter_result_init(&new_filter_result, net);
if (err < 0)
- goto errout1;
+ goto errout_alloc;
if (old_r)
cr = r->res;
- if (tb[TCA_TCINDEX_HASH])
- cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
-
- if (tb[TCA_TCINDEX_MASK])
- cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
-
- if (tb[TCA_TCINDEX_SHIFT])
- cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
-
err = -EBUSY;
/* Hash already allocated, make sure that we still meet the
@@ -376,16 +386,6 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
if (tb[TCA_TCINDEX_FALL_THROUGH])
cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
- if (!cp->hash) {
- /* Hash not specified, use perfect hash if the upper limit
- * of the hashing index is below the threshold.
- */
- if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
- cp->hash = (cp->mask >> cp->shift) + 1;
- else
- cp->hash = DEFAULT_HASH_SIZE;
- }
-
if (!cp->perfect && !cp->h)
cp->alloc_hash = cp->hash;
@@ -484,7 +484,6 @@ errout_alloc:
tcindex_free_perfect_hash(cp);
else if (balloc == 2)
kfree(cp->h);
-errout1:
tcf_exts_destroy(&new_filter_result.exts);
errout:
kfree(cp);
@@ -654,12 +653,17 @@ nla_put_failure:
return -1;
}
-static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl)
+static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl,
+ void *q, unsigned long base)
{
struct tcindex_filter_result *r = fh;
- if (r && r->res.classid == classid)
- r->res.class = cl;
+ if (r && r->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &r->res, base);
+ else
+ __tcf_unbind_filter(q, &r->res);
+ }
}
static struct tcf_proto_ops cls_tcindex_ops __read_mostly = {
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 8614088edd1b..e15ff335953d 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -480,7 +480,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
cls_u32.hnode.handle = h->handle;
cls_u32.hnode.prio = h->prio;
- tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false, true);
}
static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
@@ -498,7 +498,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
cls_u32.hnode.handle = h->handle;
cls_u32.hnode.prio = h->prio;
- err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw);
+ err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw, true);
if (err < 0) {
u32_clear_hw_hnode(tp, h, NULL);
return err;
@@ -522,8 +522,8 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
cls_u32.command = TC_CLSU32_DELETE_KNODE;
cls_u32.knode.handle = n->handle;
- tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false);
- tcf_block_offload_dec(block, &n->flags);
+ tc_setup_cb_destroy(block, tp, TC_SETUP_CLSU32, &cls_u32, false,
+ &n->flags, &n->in_hw_count, true);
}
static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
@@ -552,13 +552,11 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
if (n->ht_down)
cls_u32.knode.link_handle = ht->handle;
- err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw);
- if (err < 0) {
+ err = tc_setup_cb_add(block, tp, TC_SETUP_CLSU32, &cls_u32, skip_sw,
+ &n->flags, &n->in_hw_count, true);
+ if (err) {
u32_remove_hw_knode(tp, n, NULL);
return err;
- } else if (err > 0) {
- n->in_hw_count = err;
- tcf_block_offload_inc(block, &n->flags);
}
if (skip_sw && !(n->flags & TCA_CLS_FLAGS_IN_HW))
@@ -1201,14 +1199,11 @@ static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n,
cls_u32.knode.link_handle = ht->handle;
}
- err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv);
- if (err) {
- if (add && tc_skip_sw(n->flags))
- return err;
- return 0;
- }
-
- tc_cls_offload_cnt_update(block, &n->in_hw_count, &n->flags, add);
+ err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSU32,
+ &cls_u32, cb_priv, &n->flags,
+ &n->in_hw_count);
+ if (err)
+ return err;
return 0;
}
@@ -1260,12 +1255,17 @@ static int u32_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
return 0;
}
-static void u32_bind_class(void *fh, u32 classid, unsigned long cl)
+static void u32_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct tc_u_knode *n = fh;
- if (n && n->res.classid == classid)
- n->res.class = cl;
+ if (n && n->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &n->res, base);
+ else
+ __tcf_unbind_filter(q, &n->res);
+ }
}
static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh,
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 82bd14e7ac93..d99966a55c84 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -446,7 +446,7 @@ META_COLLECTOR(int_sk_wmem_queued)
*err = -1;
return;
}
- dst->value = sk->sk_wmem_queued;
+ dst->value = READ_ONCE(sk->sk_wmem_queued);
}
META_COLLECTOR(int_sk_fwd_alloc)
@@ -521,7 +521,7 @@ META_COLLECTOR(int_sk_ack_bl)
*err = -1;
return;
}
- dst->value = sk->sk_ack_backlog;
+ dst->value = READ_ONCE(sk->sk_ack_backlog);
}
META_COLLECTOR(int_sk_max_ack_bl)
@@ -532,7 +532,7 @@ META_COLLECTOR(int_sk_max_ack_bl)
*err = -1;
return;
}
- dst->value = sk->sk_max_ack_backlog;
+ dst->value = READ_ONCE(sk->sk_max_ack_backlog);
}
META_COLLECTOR(int_sk_prio)
@@ -554,7 +554,7 @@ META_COLLECTOR(int_sk_rcvlowat)
*err = -1;
return;
}
- dst->value = sk->sk_rcvlowat;
+ dst->value = READ_ONCE(sk->sk_rcvlowat);
}
META_COLLECTOR(int_sk_rcvtimeo)
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
index 8f2ad706784d..dd3b8c11a2e0 100644
--- a/net/sched/ematch.c
+++ b/net/sched/ematch.c
@@ -238,6 +238,9 @@ static int tcf_em_validate(struct tcf_proto *tp,
goto errout;
if (em->ops->change) {
+ err = -EINVAL;
+ if (em_hdr->flags & TCF_EM_SIMPLE)
+ goto errout;
err = em->ops->change(net, data, data_len, em);
if (err < 0)
goto errout;
@@ -263,12 +266,12 @@ static int tcf_em_validate(struct tcf_proto *tp,
}
em->data = (unsigned long) v;
}
+ em->datalen = data_len;
}
}
em->matchid = em_hdr->matchid;
em->flags = em_hdr->flags;
- em->datalen = data_len;
em->net = net;
err = 0;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 04faee7ccbce..50794125bf02 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1891,8 +1891,9 @@ static int tclass_del_notify(struct net *net,
struct tcf_bind_args {
struct tcf_walker w;
- u32 classid;
+ unsigned long base;
unsigned long cl;
+ u32 classid;
};
static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
@@ -1903,26 +1904,30 @@ static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
struct Qdisc *q = tcf_block_q(tp->chain->block);
sch_tree_lock(q);
- tp->ops->bind_class(n, a->classid, a->cl);
+ tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
sch_tree_unlock(q);
}
return 0;
}
-static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
- unsigned long new_cl)
+struct tc_bind_class_args {
+ struct qdisc_walker w;
+ unsigned long new_cl;
+ u32 portid;
+ u32 clid;
+};
+
+static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
+ struct qdisc_walker *w)
{
+ struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
const struct Qdisc_class_ops *cops = q->ops->cl_ops;
struct tcf_block *block;
struct tcf_chain *chain;
- unsigned long cl;
- cl = cops->find(q, portid);
- if (!cl)
- return;
block = cops->tcf_block(q, cl, NULL);
if (!block)
- return;
+ return 0;
for (chain = tcf_get_next_chain(block, NULL);
chain;
chain = tcf_get_next_chain(block, chain)) {
@@ -1933,11 +1938,29 @@ static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
struct tcf_bind_args arg = {};
arg.w.fn = tcf_node_bind;
- arg.classid = clid;
- arg.cl = new_cl;
+ arg.classid = a->clid;
+ arg.base = cl;
+ arg.cl = a->new_cl;
tp->ops->walk(tp, &arg.w, true);
}
}
+
+ return 0;
+}
+
+static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
+ unsigned long new_cl)
+{
+ const struct Qdisc_class_ops *cops = q->ops->cl_ops;
+ struct tc_bind_class_args args = {};
+
+ if (!cops->tcf_block)
+ return;
+ args.portid = portid;
+ args.clid = clid;
+ args.new_cl = new_cl;
+ args.w.fn = tc_bind_class_walker;
+ q->ops->cl_ops->walk(q, &args.w);
}
#else
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 53a80bc6b13a..1496e87cd07b 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -173,8 +173,7 @@ struct cake_tin_data {
u64 tin_rate_bps;
u16 tin_rate_shft;
- u16 tin_quantum_prio;
- u16 tin_quantum_band;
+ u16 tin_quantum;
s32 tin_deficit;
u32 tin_backlog;
u32 tin_dropped;
@@ -1683,8 +1682,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (IS_ERR_OR_NULL(segs))
return qdisc_drop(skb, sch, to_free);
- while (segs) {
- nskb = segs->next;
+ skb_list_walk_safe(segs, segs, nskb) {
skb_mark_not_on_list(segs);
qdisc_skb_cb(segs)->pkt_len = segs->len;
cobalt_set_enqueue_time(segs, now);
@@ -1697,7 +1695,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
slen += segs->len;
q->buffer_used += segs->truesize;
b->packets++;
- segs = nskb;
}
/* stats */
@@ -1769,7 +1766,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
q->avg_window_begin));
u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
- do_div(b, window_interval);
+ b = div64_u64(b, window_interval);
q->avg_peak_bandwidth =
cake_ewma(q->avg_peak_bandwidth, b,
b > q->avg_peak_bandwidth ? 2 : 8);
@@ -1919,7 +1916,7 @@ begin:
while (b->tin_deficit < 0 ||
!(b->sparse_flow_count + b->bulk_flow_count)) {
if (b->tin_deficit <= 0)
- b->tin_deficit += b->tin_quantum_band;
+ b->tin_deficit += b->tin_quantum;
if (b->sparse_flow_count + b->bulk_flow_count)
empty = false;
@@ -2184,6 +2181,7 @@ static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
[TCA_CAKE_MPU] = { .type = NLA_U32 },
[TCA_CAKE_INGRESS] = { .type = NLA_U32 },
[TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 },
+ [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 },
[TCA_CAKE_FWMARK] = { .type = NLA_U32 },
};
@@ -2240,8 +2238,7 @@ static int cake_config_besteffort(struct Qdisc *sch)
cake_set_rate(b, rate, mtu,
us_to_ns(q->target), us_to_ns(q->interval));
- b->tin_quantum_band = 65535;
- b->tin_quantum_prio = 65535;
+ b->tin_quantum = 65535;
return 0;
}
@@ -2252,8 +2249,7 @@ static int cake_config_precedence(struct Qdisc *sch)
struct cake_sched_data *q = qdisc_priv(sch);
u32 mtu = psched_mtu(qdisc_dev(sch));
u64 rate = q->rate_bps;
- u32 quantum1 = 256;
- u32 quantum2 = 256;
+ u32 quantum = 256;
u32 i;
q->tin_cnt = 8;
@@ -2266,18 +2262,14 @@ static int cake_config_precedence(struct Qdisc *sch)
cake_set_rate(b, rate, mtu, us_to_ns(q->target),
us_to_ns(q->interval));
- b->tin_quantum_prio = max_t(u16, 1U, quantum1);
- b->tin_quantum_band = max_t(u16, 1U, quantum2);
+ b->tin_quantum = max_t(u16, 1U, quantum);
/* calculate next class's parameters */
rate *= 7;
rate >>= 3;
- quantum1 *= 3;
- quantum1 >>= 1;
-
- quantum2 *= 7;
- quantum2 >>= 3;
+ quantum *= 7;
+ quantum >>= 3;
}
return 0;
@@ -2346,8 +2338,7 @@ static int cake_config_diffserv8(struct Qdisc *sch)
struct cake_sched_data *q = qdisc_priv(sch);
u32 mtu = psched_mtu(qdisc_dev(sch));
u64 rate = q->rate_bps;
- u32 quantum1 = 256;
- u32 quantum2 = 256;
+ u32 quantum = 256;
u32 i;
q->tin_cnt = 8;
@@ -2363,18 +2354,14 @@ static int cake_config_diffserv8(struct Qdisc *sch)
cake_set_rate(b, rate, mtu, us_to_ns(q->target),
us_to_ns(q->interval));
- b->tin_quantum_prio = max_t(u16, 1U, quantum1);
- b->tin_quantum_band = max_t(u16, 1U, quantum2);
+ b->tin_quantum = max_t(u16, 1U, quantum);
/* calculate next class's parameters */
rate *= 7;
rate >>= 3;
- quantum1 *= 3;
- quantum1 >>= 1;
-
- quantum2 *= 7;
- quantum2 >>= 3;
+ quantum *= 7;
+ quantum >>= 3;
}
return 0;
@@ -2413,17 +2400,11 @@ static int cake_config_diffserv4(struct Qdisc *sch)
cake_set_rate(&q->tins[3], rate >> 2, mtu,
us_to_ns(q->target), us_to_ns(q->interval));
- /* priority weights */
- q->tins[0].tin_quantum_prio = quantum;
- q->tins[1].tin_quantum_prio = quantum >> 4;
- q->tins[2].tin_quantum_prio = quantum << 2;
- q->tins[3].tin_quantum_prio = quantum << 4;
-
/* bandwidth-sharing weights */
- q->tins[0].tin_quantum_band = quantum;
- q->tins[1].tin_quantum_band = quantum >> 4;
- q->tins[2].tin_quantum_band = quantum >> 1;
- q->tins[3].tin_quantum_band = quantum >> 2;
+ q->tins[0].tin_quantum = quantum;
+ q->tins[1].tin_quantum = quantum >> 4;
+ q->tins[2].tin_quantum = quantum >> 1;
+ q->tins[3].tin_quantum = quantum >> 2;
return 0;
}
@@ -2454,15 +2435,10 @@ static int cake_config_diffserv3(struct Qdisc *sch)
cake_set_rate(&q->tins[2], rate >> 2, mtu,
us_to_ns(q->target), us_to_ns(q->interval));
- /* priority weights */
- q->tins[0].tin_quantum_prio = quantum;
- q->tins[1].tin_quantum_prio = quantum >> 4;
- q->tins[2].tin_quantum_prio = quantum << 4;
-
/* bandwidth-sharing weights */
- q->tins[0].tin_quantum_band = quantum;
- q->tins[1].tin_quantum_band = quantum >> 4;
- q->tins[2].tin_quantum_band = quantum >> 2;
+ q->tins[0].tin_quantum = quantum;
+ q->tins[1].tin_quantum = quantum >> 4;
+ q->tins[2].tin_quantum = quantum >> 2;
return 0;
}
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 06c7a2da21bc..39b427dc7512 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1127,6 +1127,33 @@ static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
[TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
};
+static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
+ struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (!opt) {
+ NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
+ cbq_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_CBQ_WRROPT]) {
+ const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
+
+ if (wrr->priority > TC_CBQ_MAXPRIO) {
+ NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
+ err = -EINVAL;
+ }
+ }
+ return err;
+}
+
static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -1139,13 +1166,7 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
q->delay_timer.function = cbq_undelay;
- if (!opt) {
- NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
- return -EINVAL;
- }
-
- err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, cbq_policy,
- extack);
+ err = cbq_opt_parse(tb, opt, extack);
if (err < 0)
return err;
@@ -1464,13 +1485,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
struct cbq_class *parent;
struct qdisc_rate_table *rtab = NULL;
- if (!opt) {
- NL_SET_ERR_MSG(extack, "Mandatory qdisc options missing");
- return -EINVAL;
- }
-
- err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, cbq_policy,
- extack);
+ err = cbq_opt_parse(tb, opt, extack);
if (err < 0)
return err;
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index 732e109c3055..b2905b03a432 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -181,11 +181,6 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
s64 credits;
int len;
- if (atomic64_read(&q->port_rate) == -1) {
- WARN_ONCE(1, "cbs: dequeue() called with unknown port rate.");
- return NULL;
- }
-
if (q->credits < 0) {
credits = timediff_to_credits(now - q->last, q->idleslope);
@@ -303,11 +298,19 @@ static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q,
static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q)
{
struct ethtool_link_ksettings ecmd;
- int port_rate = -1;
+ int speed = SPEED_10;
+ int port_rate;
+ int err;
+
+ err = __ethtool_get_link_ksettings(dev, &ecmd);
+ if (err < 0)
+ goto skip;
+
+ if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN)
+ speed = ecmd.base.speed;
- if (!__ethtool_get_link_ksettings(dev, &ecmd) &&
- ecmd.base.speed != SPEED_UNKNOWN)
- port_rate = ecmd.base.speed * 1000 * BYTES_PER_KBIT;
+skip:
+ port_rate = speed * 1000 * BYTES_PER_KBIT;
atomic64_set(&q->port_rate, port_rate);
netdev_dbg(dev, "cbs: set %s's port_rate to: %lld, linkspeed: %d\n",
@@ -389,7 +392,6 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt,
{
struct cbs_sched_data *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
- int err;
if (!opt) {
NL_SET_ERR_MSG(extack, "Missing CBS qdisc options which are mandatory");
@@ -401,6 +403,10 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt,
if (!q->qdisc)
return -ENOMEM;
+ spin_lock(&cbs_list_lock);
+ list_add(&q->cbs_list, &cbs_list);
+ spin_unlock(&cbs_list_lock);
+
qdisc_hash_add(q->qdisc, false);
q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
@@ -410,17 +416,7 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt,
qdisc_watchdog_init(&q->watchdog, sch);
- err = cbs_change(sch, opt, extack);
- if (err)
- return err;
-
- if (!q->offload) {
- spin_lock(&cbs_list_lock);
- list_add(&q->cbs_list, &cbs_list);
- spin_unlock(&cbs_list_lock);
- }
-
- return 0;
+ return cbs_change(sch, opt, extack);
}
static void cbs_destroy(struct Qdisc *sch)
@@ -428,15 +424,18 @@ static void cbs_destroy(struct Qdisc *sch)
struct cbs_sched_data *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
- spin_lock(&cbs_list_lock);
- list_del(&q->cbs_list);
- spin_unlock(&cbs_list_lock);
+ /* Nothing to do if we couldn't create the underlying qdisc */
+ if (!q->qdisc)
+ return;
qdisc_watchdog_cancel(&q->watchdog);
cbs_disable_offload(dev, q);
- if (q->qdisc)
- qdisc_put(q->qdisc);
+ spin_lock(&cbs_list_lock);
+ list_del(&q->cbs_list);
+ spin_unlock(&cbs_list_lock);
+
+ qdisc_put(q->qdisc);
}
static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index dba70377bbd9..a36974e9c601 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -377,7 +377,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt,
if (mask != q->tab_mask) {
struct sk_buff **ntab;
- ntab = kvmalloc_array((mask + 1), sizeof(struct sk_buff *), GFP_KERNEL | __GFP_ZERO);
+ ntab = kvcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
if (!ntab)
return -ENOMEM;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index bad1cbe59a56..05605b30bef3 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -361,6 +361,8 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt,
goto errout;
err = -EINVAL;
+ if (!tb[TCA_DSMARK_INDICES])
+ goto errout;
indices = nla_get_u16(tb[TCA_DSMARK_INDICES]);
if (hweight32(indices) != 1)
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
index cebfb65d8556..b1da5589a0c6 100644
--- a/net/sched/sch_etf.c
+++ b/net/sched/sch_etf.c
@@ -177,7 +177,7 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
parent = *p;
skb = rb_to_skb(parent);
- if (ktime_after(txtime, skb->tstamp)) {
+ if (ktime_compare(txtime, skb->tstamp) >= 0) {
p = &parent->rb_right;
leftmost = false;
} else {
diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c
new file mode 100644
index 000000000000..a87e9159338c
--- /dev/null
+++ b/net/sched/sch_ets.c
@@ -0,0 +1,828 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * net/sched/sch_ets.c Enhanced Transmission Selection scheduler
+ *
+ * Description
+ * -----------
+ *
+ * The Enhanced Transmission Selection scheduler is a classful queuing
+ * discipline that merges functionality of PRIO and DRR qdiscs in one scheduler.
+ * ETS makes it easy to configure a set of strict and bandwidth-sharing bands to
+ * implement the transmission selection described in 802.1Qaz.
+ *
+ * Although ETS is technically classful, it's not possible to add and remove
+ * classes at will. Instead one specifies the number of classes, how many of
+ * them are PRIO-like and how many DRR-like, and the quanta for the latter.
+ *
+ * Algorithm
+ * ---------
+ *
+ * The strict classes, if any, are tried for traffic first: first band 0, if it
+ * has no traffic then band 1, etc.
+ *
+ * When there is no traffic in any of the strict queues, the bandwidth-sharing
+ * ones are tried next. Each band is assigned a deficit counter, initialized to
+ * "quantum" of that band. ETS maintains a list of active bandwidth-sharing
+ * bands whose qdiscs are non-empty. A packet is dequeued from the band at the
+ * head of the list if the packet size is smaller than or equal to the deficit
+ * counter. If the counter is too small, it is increased by "quantum" and the
+ * scheduler moves on to the next band in the active list.
+ */
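
The bandwidth-sharing half of this algorithm is ordinary deficit round robin. A minimal standalone sketch of that dequeue rule follows; it is an illustration only, with invented names (toy_band, toy_dequeue), a flat array standing in for the active list, and a single pass instead of the scheduler's retry loop:

	#include <stdio.h>

	/* Each toy band has a quantum, a running deficit and the length of
	 * its head packet (0 means the band is inactive).
	 */
	struct toy_band {
		unsigned int quantum;
		unsigned int deficit;
		unsigned int head_len;
	};

	/* One pass over the bands: transmit from a band only while its
	 * deficit covers the head packet, otherwise top the deficit up by
	 * one quantum and move on, as described in the comment above.
	 */
	static int toy_dequeue(struct toy_band *bands, int nbands)
	{
		int i;

		for (i = 0; i < nbands; i++) {
			struct toy_band *b = &bands[i];

			if (!b->head_len)
				continue;
			if (b->head_len <= b->deficit) {
				b->deficit -= b->head_len;
				return i;	/* dequeue from this band */
			}
			b->deficit += b->quantum;
		}
		return -1;	/* nothing eligible in this pass */
	}

	int main(void)
	{
		struct toy_band bands[] = {
			{ .quantum = 1500, .deficit = 1500, .head_len = 1000 },
			{ .quantum = 3000, .deficit = 3000, .head_len = 4000 },
		};

		printf("dequeued from band %d\n", toy_dequeue(bands, 2));
		return 0;
	}

The real qdisc additionally serves the strict bands first and rotates a refilled band to the tail of the active list; the sketch only shows the deficit accounting.
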
+
+#include <linux/module.h>
+#include <net/gen_stats.h>
+#include <net/netlink.h>
+#include <net/pkt_cls.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+
+struct ets_class {
+ struct list_head alist; /* In struct ets_sched.active. */
+ struct Qdisc *qdisc;
+ u32 quantum;
+ u32 deficit;
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+};
+
+struct ets_sched {
+ struct list_head active;
+ struct tcf_proto __rcu *filter_list;
+ struct tcf_block *block;
+ unsigned int nbands;
+ unsigned int nstrict;
+ u8 prio2band[TC_PRIO_MAX + 1];
+ struct ets_class classes[TCQ_ETS_MAX_BANDS];
+};
+
+static const struct nla_policy ets_policy[TCA_ETS_MAX + 1] = {
+ [TCA_ETS_NBANDS] = { .type = NLA_U8 },
+ [TCA_ETS_NSTRICT] = { .type = NLA_U8 },
+ [TCA_ETS_QUANTA] = { .type = NLA_NESTED },
+ [TCA_ETS_PRIOMAP] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ets_priomap_policy[TCA_ETS_MAX + 1] = {
+ [TCA_ETS_PRIOMAP_BAND] = { .type = NLA_U8 },
+};
+
+static const struct nla_policy ets_quanta_policy[TCA_ETS_MAX + 1] = {
+ [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = {
+ [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 },
+};
+
+static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr,
+ unsigned int *quantum,
+ struct netlink_ext_ack *extack)
+{
+ *quantum = nla_get_u32(attr);
+ if (!*quantum) {
+ NL_SET_ERR_MSG(extack, "ETS quantum cannot be zero");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct ets_class *
+ets_class_from_arg(struct Qdisc *sch, unsigned long arg)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+
+ return &q->classes[arg - 1];
+}
+
+static u32 ets_class_id(struct Qdisc *sch, const struct ets_class *cl)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int band = cl - q->classes;
+
+ return TC_H_MAKE(sch->handle, band + 1);
+}
+
+static void ets_offload_change(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct ets_sched *q = qdisc_priv(sch);
+ struct tc_ets_qopt_offload qopt;
+ unsigned int w_psum_prev = 0;
+ unsigned int q_psum = 0;
+ unsigned int q_sum = 0;
+ unsigned int quantum;
+ unsigned int w_psum;
+ unsigned int weight;
+ unsigned int i;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_ETS_REPLACE;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.replace_params.bands = q->nbands;
+ qopt.replace_params.qstats = &sch->qstats;
+ memcpy(&qopt.replace_params.priomap,
+ q->prio2band, sizeof(q->prio2band));
+
+ for (i = 0; i < q->nbands; i++)
+ q_sum += q->classes[i].quantum;
+
+ for (i = 0; i < q->nbands; i++) {
+ quantum = q->classes[i].quantum;
+ q_psum += quantum;
+ w_psum = quantum ? q_psum * 100 / q_sum : 0;
+ weight = w_psum - w_psum_prev;
+ w_psum_prev = w_psum;
+
+ qopt.replace_params.quanta[i] = quantum;
+ qopt.replace_params.weights[i] = weight;
+ }
+
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt);
+}
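
ets_offload_change() above turns the configured quanta into driver weights by computing each band's cumulative share of the quantum sum in percent and taking successive differences, so the weights add up to (at most) 100. A worked example with invented quanta, not taken from the patch:

	/* quanta = {1000, 3000, 6000}, q_sum = 10000
	 *
	 * band 0: q_psum =  1000, w_psum =  10, weight =  10 -  0 = 10
	 * band 1: q_psum =  4000, w_psum =  40, weight =  40 - 10 = 30
	 * band 2: q_psum = 10000, w_psum = 100, weight = 100 - 40 = 60
	 */
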
+
+static void ets_offload_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_ets_qopt_offload qopt;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_ETS_DESTROY;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt);
+}
+
+static void ets_offload_graft(struct Qdisc *sch, struct Qdisc *new,
+ struct Qdisc *old, unsigned long arg,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_ets_qopt_offload qopt;
+
+ qopt.command = TC_ETS_GRAFT;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.graft_params.band = arg - 1;
+ qopt.graft_params.child_handle = new->handle;
+
+ qdisc_offload_graft_helper(dev, sch, new, old, TC_SETUP_QDISC_ETS,
+ &qopt, extack);
+}
+
+static int ets_offload_dump(struct Qdisc *sch)
+{
+ struct tc_ets_qopt_offload qopt;
+
+ qopt.command = TC_ETS_STATS;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.stats.bstats = &sch->bstats;
+ qopt.stats.qstats = &sch->qstats;
+
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_ETS, &qopt);
+}
+
+static bool ets_class_is_strict(struct ets_sched *q, const struct ets_class *cl)
+{
+ unsigned int band = cl - q->classes;
+
+ return band < q->nstrict;
+}
+
+static int ets_class_change(struct Qdisc *sch, u32 classid, u32 parentid,
+ struct nlattr **tca, unsigned long *arg,
+ struct netlink_ext_ack *extack)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, *arg);
+ struct ets_sched *q = qdisc_priv(sch);
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_ETS_MAX + 1];
+ unsigned int quantum;
+ int err;
+
+ /* Classes can be added and removed only through Qdisc_ops.change
+ * interface.
+ */
+ if (!cl) {
+ NL_SET_ERR_MSG(extack, "Fine-grained class addition and removal is not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (!opt) {
+ NL_SET_ERR_MSG(extack, "ETS options are required for this operation");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_class_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_ETS_QUANTA_BAND])
+ /* Nothing to configure. */
+ return 0;
+
+ if (ets_class_is_strict(q, cl)) {
+ NL_SET_ERR_MSG(extack, "Strict bands do not have a configurable quantum");
+ return -EINVAL;
+ }
+
+ err = ets_quantum_parse(sch, tb[TCA_ETS_QUANTA_BAND], &quantum,
+ extack);
+ if (err)
+ return err;
+
+ sch_tree_lock(sch);
+ cl->quantum = quantum;
+ sch_tree_unlock(sch);
+
+ ets_offload_change(sch);
+ return 0;
+}
+
+static int ets_class_graft(struct Qdisc *sch, unsigned long arg,
+ struct Qdisc *new, struct Qdisc **old,
+ struct netlink_ext_ack *extack)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+
+ if (!new) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ ets_class_id(sch, cl), NULL);
+ if (!new)
+ new = &noop_qdisc;
+ else
+ qdisc_hash_add(new, true);
+ }
+
+ *old = qdisc_replace(sch, new, &cl->qdisc);
+ ets_offload_graft(sch, new, *old, arg, extack);
+ return 0;
+}
+
+static struct Qdisc *ets_class_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+
+ return cl->qdisc;
+}
+
+static unsigned long ets_class_find(struct Qdisc *sch, u32 classid)
+{
+ unsigned long band = TC_H_MIN(classid);
+ struct ets_sched *q = qdisc_priv(sch);
+
+ if (band - 1 >= q->nbands)
+ return 0;
+ return band;
+}
+
+static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+ struct ets_sched *q = qdisc_priv(sch);
+
+ /* We get notified about zero-length child Qdiscs as well if they are
+ * offloaded. Those aren't on the active list though, so don't attempt
+ * to remove them.
+ */
+ if (!ets_class_is_strict(q, cl) && sch->q.qlen)
+ list_del(&cl->alist);
+}
+
+static int ets_class_dump(struct Qdisc *sch, unsigned long arg,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+ struct ets_sched *q = qdisc_priv(sch);
+ struct nlattr *nest;
+
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_handle = ets_class_id(sch, cl);
+ tcm->tcm_info = cl->qdisc->handle;
+
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+ if (!ets_class_is_strict(q, cl)) {
+ if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, cl->quantum))
+ goto nla_put_failure;
+ }
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int ets_class_dump_stats(struct Qdisc *sch, unsigned long arg,
+ struct gnet_dump *d)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+ struct Qdisc *cl_q = cl->qdisc;
+
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, NULL, &cl_q->bstats) < 0 ||
+ qdisc_qstats_copy(d, cl_q) < 0)
+ return -1;
+
+ return 0;
+}
+
+static void ets_qdisc_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->nbands; i++) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, i + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static struct tcf_block *
+ets_qdisc_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+
+ if (cl) {
+ NL_SET_ERR_MSG(extack, "ETS classid must be zero");
+ return NULL;
+ }
+
+ return q->block;
+}
+
+static unsigned long ets_qdisc_bind_tcf(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return ets_class_find(sch, classid);
+}
+
+static void ets_qdisc_unbind_tcf(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ u32 band = skb->priority;
+ struct tcf_result res;
+ struct tcf_proto *fl;
+ int err;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ if (TC_H_MAJ(skb->priority) != sch->handle) {
+ fl = rcu_dereference_bh(q->filter_list);
+ err = tcf_classify(skb, fl, &res, false);
+#ifdef CONFIG_NET_CLS_ACT
+ switch (err) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ /* fall through */
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+#endif
+ if (!fl || err < 0) {
+ if (TC_H_MAJ(band))
+ band = 0;
+ return &q->classes[q->prio2band[band & TC_PRIO_MAX]];
+ }
+ band = res.classid;
+ }
+ band = TC_H_MIN(band) - 1;
+ if (band >= q->nbands)
+ return &q->classes[q->prio2band[0]];
+ return &q->classes[band];
+}
+
+static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ unsigned int len = qdisc_pkt_len(skb);
+ struct ets_sched *q = qdisc_priv(sch);
+ struct ets_class *cl;
+ int err = 0;
+ bool first;
+
+ cl = ets_classify(skb, sch, &err);
+ if (!cl) {
+ if (err & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return err;
+ }
+
+ first = !cl->qdisc->q.qlen;
+ err = qdisc_enqueue(skb, cl->qdisc, to_free);
+ if (unlikely(err != NET_XMIT_SUCCESS)) {
+ if (net_xmit_drop_count(err)) {
+ cl->qstats.drops++;
+ qdisc_qstats_drop(sch);
+ }
+ return err;
+ }
+
+ if (first && !ets_class_is_strict(q, cl)) {
+ list_add_tail(&cl->alist, &q->active);
+ cl->deficit = cl->quantum;
+ }
+
+ sch->qstats.backlog += len;
+ sch->q.qlen++;
+ return err;
+}
+
+static struct sk_buff *
+ets_qdisc_dequeue_skb(struct Qdisc *sch, struct sk_buff *skb)
+{
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+ return skb;
+}
+
+static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ struct ets_class *cl;
+ struct sk_buff *skb;
+ unsigned int band;
+ unsigned int len;
+
+ while (1) {
+ for (band = 0; band < q->nstrict; band++) {
+ cl = &q->classes[band];
+ skb = qdisc_dequeue_peeked(cl->qdisc);
+ if (skb)
+ return ets_qdisc_dequeue_skb(sch, skb);
+ }
+
+ if (list_empty(&q->active))
+ goto out;
+
+ cl = list_first_entry(&q->active, struct ets_class, alist);
+ skb = cl->qdisc->ops->peek(cl->qdisc);
+ if (!skb) {
+ qdisc_warn_nonwc(__func__, cl->qdisc);
+ goto out;
+ }
+
+ len = qdisc_pkt_len(skb);
+ if (len <= cl->deficit) {
+ cl->deficit -= len;
+ skb = qdisc_dequeue_peeked(cl->qdisc);
+ if (unlikely(!skb))
+ goto out;
+ if (cl->qdisc->q.qlen == 0)
+ list_del(&cl->alist);
+ return ets_qdisc_dequeue_skb(sch, skb);
+ }
+
+ cl->deficit += cl->quantum;
+ list_move_tail(&cl->alist, &q->active);
+ }
+out:
+ return NULL;
+}
+
+static int ets_qdisc_priomap_parse(struct nlattr *priomap_attr,
+ unsigned int nbands, u8 *priomap,
+ struct netlink_ext_ack *extack)
+{
+ const struct nlattr *attr;
+ int prio = 0;
+ u8 band;
+ int rem;
+ int err;
+
+ err = __nla_validate_nested(priomap_attr, TCA_ETS_MAX,
+ ets_priomap_policy, NL_VALIDATE_STRICT,
+ extack);
+ if (err)
+ return err;
+
+ nla_for_each_nested(attr, priomap_attr, rem) {
+ switch (nla_type(attr)) {
+ case TCA_ETS_PRIOMAP_BAND:
+ if (prio > TC_PRIO_MAX) {
+ NL_SET_ERR_MSG_MOD(extack, "Too many priorities in ETS priomap");
+ return -EINVAL;
+ }
+ band = nla_get_u8(attr);
+ if (band >= nbands) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid band number in ETS priomap");
+ return -EINVAL;
+ }
+ priomap[prio++] = band;
+ break;
+ default:
+ WARN_ON_ONCE(1); /* Validate should have caught this. */
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int ets_qdisc_quanta_parse(struct Qdisc *sch, struct nlattr *quanta_attr,
+ unsigned int nbands, unsigned int nstrict,
+ unsigned int *quanta,
+ struct netlink_ext_ack *extack)
+{
+ const struct nlattr *attr;
+ int band = nstrict;
+ int rem;
+ int err;
+
+ err = __nla_validate_nested(quanta_attr, TCA_ETS_MAX,
+ ets_quanta_policy, NL_VALIDATE_STRICT,
+ extack);
+ if (err < 0)
+ return err;
+
+ nla_for_each_nested(attr, quanta_attr, rem) {
+ switch (nla_type(attr)) {
+ case TCA_ETS_QUANTA_BAND:
+ if (band >= nbands) {
+ NL_SET_ERR_MSG_MOD(extack, "ETS quanta has more values than bands");
+ return -EINVAL;
+ }
+ err = ets_quantum_parse(sch, attr, &quanta[band++],
+ extack);
+ if (err)
+ return err;
+ break;
+ default:
+ WARN_ON_ONCE(1); /* Validate should have caught this. */
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ unsigned int quanta[TCQ_ETS_MAX_BANDS] = {0};
+ struct Qdisc *queues[TCQ_ETS_MAX_BANDS];
+ struct ets_sched *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_ETS_MAX + 1];
+ unsigned int oldbands = q->nbands;
+ u8 priomap[TC_PRIO_MAX + 1];
+ unsigned int nstrict = 0;
+ unsigned int nbands;
+ unsigned int i;
+ int err;
+
+ if (!opt) {
+ NL_SET_ERR_MSG(extack, "ETS options are required for this operation");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_ETS_NBANDS]) {
+ NL_SET_ERR_MSG_MOD(extack, "Number of bands is a required argument");
+ return -EINVAL;
+ }
+ nbands = nla_get_u8(tb[TCA_ETS_NBANDS]);
+ if (nbands < 1 || nbands > TCQ_ETS_MAX_BANDS) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid number of bands");
+ return -EINVAL;
+ }
+ /* Unless overridden, traffic goes to the last band. */
+ memset(priomap, nbands - 1, sizeof(priomap));
+
+ if (tb[TCA_ETS_NSTRICT]) {
+ nstrict = nla_get_u8(tb[TCA_ETS_NSTRICT]);
+ if (nstrict > nbands) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid number of strict bands");
+ return -EINVAL;
+ }
+ }
+
+ if (tb[TCA_ETS_PRIOMAP]) {
+ err = ets_qdisc_priomap_parse(tb[TCA_ETS_PRIOMAP],
+ nbands, priomap, extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[TCA_ETS_QUANTA]) {
+ err = ets_qdisc_quanta_parse(sch, tb[TCA_ETS_QUANTA],
+ nbands, nstrict, quanta, extack);
+ if (err)
+ return err;
+ }
+ /* If there are more bands than strict + quanta provided, the remaining
+ * ones are ETS with quantum of MTU. Initialize the missing values here.
+ */
+ for (i = nstrict; i < nbands; i++) {
+ if (!quanta[i])
+ quanta[i] = psched_mtu(qdisc_dev(sch));
+ }
+
+ /* Before commit, make sure we can allocate all new qdiscs */
+ for (i = oldbands; i < nbands; i++) {
+ queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ ets_class_id(sch, &q->classes[i]),
+ extack);
+ if (!queues[i]) {
+ while (i > oldbands)
+ qdisc_put(queues[--i]);
+ return -ENOMEM;
+ }
+ }
+
+ sch_tree_lock(sch);
+
+ q->nbands = nbands;
+ q->nstrict = nstrict;
+ memcpy(q->prio2band, priomap, sizeof(priomap));
+
+ for (i = q->nbands; i < oldbands; i++)
+ qdisc_tree_flush_backlog(q->classes[i].qdisc);
+
+ for (i = 0; i < q->nbands; i++)
+ q->classes[i].quantum = quanta[i];
+
+ for (i = oldbands; i < q->nbands; i++) {
+ q->classes[i].qdisc = queues[i];
+ if (q->classes[i].qdisc != &noop_qdisc)
+ qdisc_hash_add(q->classes[i].qdisc, true);
+ }
+
+ sch_tree_unlock(sch);
+
+ ets_offload_change(sch);
+ for (i = q->nbands; i < oldbands; i++) {
+ qdisc_put(q->classes[i].qdisc);
+ memset(&q->classes[i], 0, sizeof(q->classes[i]));
+ }
+ return 0;
+}
+
+static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ return err;
+
+ INIT_LIST_HEAD(&q->active);
+ return ets_qdisc_change(sch, opt, extack);
+}
+
+static void ets_qdisc_reset(struct Qdisc *sch)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int band;
+
+ for (band = q->nstrict; band < q->nbands; band++) {
+ if (q->classes[band].qdisc->q.qlen)
+ list_del(&q->classes[band].alist);
+ }
+ for (band = 0; band < q->nbands; band++)
+ qdisc_reset(q->classes[band].qdisc);
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+}
+
+static void ets_qdisc_destroy(struct Qdisc *sch)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int band;
+
+ ets_offload_destroy(sch);
+ tcf_block_put(q->block);
+ for (band = 0; band < q->nbands; band++)
+ qdisc_put(q->classes[band].qdisc);
+}
+
+static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ struct nlattr *opts;
+ struct nlattr *nest;
+ int band;
+ int prio;
+ int err;
+
+ err = ets_offload_dump(sch);
+ if (err)
+ return err;
+
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
+ if (!opts)
+ goto nla_err;
+
+ if (nla_put_u8(skb, TCA_ETS_NBANDS, q->nbands))
+ goto nla_err;
+
+ if (q->nstrict &&
+ nla_put_u8(skb, TCA_ETS_NSTRICT, q->nstrict))
+ goto nla_err;
+
+ if (q->nbands > q->nstrict) {
+ nest = nla_nest_start(skb, TCA_ETS_QUANTA);
+ if (!nest)
+ goto nla_err;
+
+ for (band = q->nstrict; band < q->nbands; band++) {
+ if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND,
+ q->classes[band].quantum))
+ goto nla_err;
+ }
+
+ nla_nest_end(skb, nest);
+ }
+
+ nest = nla_nest_start(skb, TCA_ETS_PRIOMAP);
+ if (!nest)
+ goto nla_err;
+
+ for (prio = 0; prio <= TC_PRIO_MAX; prio++) {
+ if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, q->prio2band[prio]))
+ goto nla_err;
+ }
+
+ nla_nest_end(skb, nest);
+
+ return nla_nest_end(skb, opts);
+
+nla_err:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static const struct Qdisc_class_ops ets_class_ops = {
+ .change = ets_class_change,
+ .graft = ets_class_graft,
+ .leaf = ets_class_leaf,
+ .find = ets_class_find,
+ .qlen_notify = ets_class_qlen_notify,
+ .dump = ets_class_dump,
+ .dump_stats = ets_class_dump_stats,
+ .walk = ets_qdisc_walk,
+ .tcf_block = ets_qdisc_tcf_block,
+ .bind_tcf = ets_qdisc_bind_tcf,
+ .unbind_tcf = ets_qdisc_unbind_tcf,
+};
+
+static struct Qdisc_ops ets_qdisc_ops __read_mostly = {
+ .cl_ops = &ets_class_ops,
+ .id = "ets",
+ .priv_size = sizeof(struct ets_sched),
+ .enqueue = ets_qdisc_enqueue,
+ .dequeue = ets_qdisc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .change = ets_qdisc_change,
+ .init = ets_qdisc_init,
+ .reset = ets_qdisc_reset,
+ .destroy = ets_qdisc_destroy,
+ .dump = ets_qdisc_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init ets_init(void)
+{
+ return register_qdisc(&ets_qdisc_ops);
+}
+
+static void __exit ets_exit(void)
+{
+ unregister_qdisc(&ets_qdisc_ops);
+}
+
+module_init(ets_init);
+module_exit(ets_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 98dd87ce1510..a5a295477ecc 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -301,6 +301,9 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
f->socket_hash != sk->sk_hash)) {
f->credit = q->initial_quantum;
f->socket_hash = sk->sk_hash;
+ if (q->rate_enable)
+ smp_store_release(&sk->sk_pacing_status,
+ SK_PACING_FQ);
if (fq_flow_is_throttled(f))
fq_flow_unset_throttled(q, f);
f->time_next_packet = 0ULL;
@@ -322,8 +325,12 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
fq_flow_set_detached(f);
f->sk = sk;
- if (skb->sk == sk)
+ if (skb->sk == sk) {
f->socket_hash = sk->sk_hash;
+ if (q->rate_enable)
+ smp_store_release(&sk->sk_pacing_status,
+ SK_PACING_FQ);
+ }
f->credit = q->initial_quantum;
rb_link_node(&f->fq_node, parent, p);
@@ -428,17 +435,9 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
f->qlen++;
qdisc_qstats_backlog_inc(sch, skb);
if (fq_flow_is_detached(f)) {
- struct sock *sk = skb->sk;
-
fq_flow_add_tail(&q->new_flows, f);
if (time_after(jiffies, f->age + q->flow_refill_delay))
f->credit = max_t(u32, f->credit, q->quantum);
- if (sk && q->rate_enable) {
- if (unlikely(smp_load_acquire(&sk->sk_pacing_status) !=
- SK_PACING_FQ))
- smp_store_release(&sk->sk_pacing_status,
- SK_PACING_FQ);
- }
q->inactive_flows--;
}
@@ -530,8 +529,7 @@ begin:
fq_flow_set_throttled(q, f);
goto begin;
}
- if (time_next_packet &&
- (s64)(now - time_next_packet - q->ce_threshold) > 0) {
+ if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
INET_ECN_set_ce(skb);
q->stat_ce_mark++;
}
@@ -788,10 +786,12 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_FQ_QUANTUM]) {
u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
- if (quantum > 0)
+ if (quantum > 0 && quantum <= (1 << 20)) {
q->quantum = quantum;
- else
+ } else {
+ NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
err = -EINVAL;
+ }
}
if (tb[TCA_FQ_INITIAL_QUANTUM])
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index d59fbcc745d1..968519ff36e9 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -14,7 +14,6 @@
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
-#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
@@ -45,7 +44,6 @@ struct fq_codel_flow {
struct sk_buff *tail;
struct list_head flowchain;
int deficit;
- u32 dropped; /* number of drops (or ECN marks) on this flow */
struct codel_vars cvars;
}; /* please try to keep this structure <= 64 bytes */
@@ -173,7 +171,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets,
__qdisc_drop(skb, to_free);
} while (++i < max_packets && len < threshold);
- flow->dropped += i;
+ /* Tell codel to increase its signal strength also */
+ flow->cvars.count += i;
q->backlogs[idx] -= len;
q->memory_usage -= mem;
sch->qstats.drops += i;
@@ -211,7 +210,6 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch,
list_add_tail(&flow->flowchain, &q->new_flows);
q->new_flow_count++;
flow->deficit = q->quantum;
- flow->dropped = 0;
}
get_codel_cb(skb)->mem_usage = skb->truesize;
q->memory_usage += get_codel_cb(skb)->mem_usage;
@@ -286,7 +284,6 @@ static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
struct sk_buff *skb;
struct fq_codel_flow *flow;
struct list_head *head;
- u32 prev_drop_count, prev_ecn_mark;
begin:
head = &q->new_flows;
@@ -303,16 +300,10 @@ begin:
goto begin;
}
- prev_drop_count = q->cstats.drop_count;
- prev_ecn_mark = q->cstats.ecn_mark;
-
skb = codel_dequeue(sch, &sch->qstats.backlog, &q->cparams,
&flow->cvars, &q->cstats, qdisc_pkt_len,
codel_get_enqueue_time, drop_func, dequeue_func);
- flow->dropped += q->cstats.drop_count - prev_drop_count;
- flow->dropped += q->cstats.ecn_mark - prev_ecn_mark;
-
if (!skb) {
/* force a pass through old_flows to prevent starvation */
if ((head == &q->new_flows) && !list_empty(&q->old_flows))
@@ -658,7 +649,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
sch_tree_unlock(sch);
}
qs.backlog = q->backlogs[idx];
- qs.drops = flow->dropped;
+ qs.drops = 0;
}
if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
return -1;
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
new file mode 100644
index 000000000000..214657eb3dfd
--- /dev/null
+++ b/net/sched/sch_fq_pie.c
@@ -0,0 +1,562 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Flow Queue PIE discipline
+ *
+ * Copyright (C) 2019 Mohit P. Tahiliani <tahiliani@nitk.edu.in>
+ * Copyright (C) 2019 Sachin D. Patil <sdp.sachin@gmail.com>
+ * Copyright (C) 2019 V. Saicharan <vsaicharan1998@gmail.com>
+ * Copyright (C) 2019 Mohit Bhasi <mohitbhasi1998@gmail.com>
+ * Copyright (C) 2019 Leslie Monis <lesliemonis@gmail.com>
+ * Copyright (C) 2019 Gautam Ramakrishnan <gautamramk@gmail.com>
+ */
+
+#include <linux/jhash.h>
+#include <linux/sizes.h>
+#include <linux/vmalloc.h>
+#include <net/pkt_cls.h>
+#include <net/pie.h>
+
+/* Flow Queue PIE
+ *
+ * Principles:
+ * - Packets are classified on flows.
+ * - This is a Stochastic model (as we use a hash, several flows might
+ * be hashed to the same slot)
+ * - Each flow has a PIE managed queue.
+ * - Flows are linked onto two (Round Robin) lists,
+ * so that new flows have priority on old ones.
+ * - For a given flow, packets are not reordered.
+ * - Drops during enqueue only.
+ * - ECN capability is off by default.
+ * - ECN threshold (if ECN is enabled) is at 10% by default.
+ * - Uses timestamps to calculate queue delay by default.
+ */
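/* [Editor's sketch -- not part of this patch.]  The "stochastic" flow
 * classification in the first two bullets is a plain hash reduced onto
 * flows_cnt slots.  fq_pie_hash() below uses the reciprocal_scale() style
 * multiply-and-shift shown here instead of a modulo; the hash values in
 * main() are arbitrary examples.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t toy_bucket(uint32_t hash, uint32_t flows_cnt)
{
	/* same arithmetic as reciprocal_scale(): (hash * n) >> 32 */
	return (uint32_t)(((uint64_t)hash * flows_cnt) >> 32);
}

int main(void)
{
	uint32_t flows_cnt = 1024;	/* the qdisc's default flow count */
	uint32_t hashes[] = { 0x00000000u, 0x12345678u, 0x80000000u, 0xffffffffu };
	unsigned int i;

	for (i = 0; i < sizeof(hashes) / sizeof(hashes[0]); i++)
		printf("hash 0x%08x -> flow %u of %u\n",
		       (unsigned int)hashes[i],
		       (unsigned int)toy_bucket(hashes[i], flows_cnt),
		       (unsigned int)flows_cnt);
	return 0;
}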
+
+/**
+ * struct fq_pie_flow - contains data for each flow
+ * @vars: pie vars associated with the flow
+ * @deficit: number of remaining byte credits
+ * @backlog: size of data in the flow
+ * @qlen: number of packets in the flow
+ * @flowchain: flowchain for the flow
+ * @head: first packet in the flow
+ * @tail: last packet in the flow
+ */
+struct fq_pie_flow {
+ struct pie_vars vars;
+ s32 deficit;
+ u32 backlog;
+ u32 qlen;
+ struct list_head flowchain;
+ struct sk_buff *head;
+ struct sk_buff *tail;
+};
+
+struct fq_pie_sched_data {
+ struct tcf_proto __rcu *filter_list; /* optional external classifier */
+ struct tcf_block *block;
+ struct fq_pie_flow *flows;
+ struct Qdisc *sch;
+ struct list_head old_flows;
+ struct list_head new_flows;
+ struct pie_params p_params;
+ u32 ecn_prob;
+ u32 flows_cnt;
+ u32 quantum;
+ u32 memory_limit;
+ u32 new_flow_count;
+ u32 memory_usage;
+ u32 overmemory;
+ struct pie_stats stats;
+ struct timer_list adapt_timer;
+};
+
+static unsigned int fq_pie_hash(const struct fq_pie_sched_data *q,
+ struct sk_buff *skb)
+{
+ return reciprocal_scale(skb_get_hash(skb), q->flows_cnt);
+}
+
+static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct tcf_proto *filter;
+ struct tcf_result res;
+ int result;
+
+ if (TC_H_MAJ(skb->priority) == sch->handle &&
+ TC_H_MIN(skb->priority) > 0 &&
+ TC_H_MIN(skb->priority) <= q->flows_cnt)
+ return TC_H_MIN(skb->priority);
+
+ filter = rcu_dereference_bh(q->filter_list);
+ if (!filter)
+ return fq_pie_hash(q, skb) + 1;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ result = tcf_classify(skb, filter, &res, false);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ /* fall through */
+ case TC_ACT_SHOT:
+ return 0;
+ }
+#endif
+ if (TC_H_MIN(res.classid) <= q->flows_cnt)
+ return TC_H_MIN(res.classid);
+ }
+ return 0;
+}
+
+/* add skb to flow queue (tail add) */
+static inline void flow_queue_add(struct fq_pie_flow *flow,
+ struct sk_buff *skb)
+{
+ if (!flow->head)
+ flow->head = skb;
+ else
+ flow->tail->next = skb;
+ flow->tail = skb;
+ skb->next = NULL;
+}
+
+static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct fq_pie_flow *sel_flow;
+ int uninitialized_var(ret);
+ u8 memory_limited = false;
+ u8 enqueue = false;
+ u32 pkt_len;
+ u32 idx;
+
+ /* Classifies packet into corresponding flow */
+ idx = fq_pie_classify(skb, sch, &ret);
+ sel_flow = &q->flows[idx];
+
+ /* Checks whether adding a new packet would exceed memory limit */
+ get_pie_cb(skb)->mem_usage = skb->truesize;
+ memory_limited = q->memory_usage > q->memory_limit + skb->truesize;
+
+ /* Checks if the qdisc is full */
+ if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
+ q->stats.overlimit++;
+ goto out;
+ } else if (unlikely(memory_limited)) {
+ q->overmemory++;
+ }
+
+ if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars,
+ sel_flow->backlog, skb->len)) {
+ enqueue = true;
+ } else if (q->p_params.ecn &&
+ sel_flow->vars.prob <= (MAX_PROB / 100) * q->ecn_prob &&
+ INET_ECN_set_ce(skb)) {
+ /* If packet is ecn capable, mark it if drop probability
+ * is lower than the parameter ecn_prob, else drop it.
+ */
+ q->stats.ecn_mark++;
+ enqueue = true;
+ }
+ if (enqueue) {
+ /* Set enqueue time only when dq_rate_estimator is disabled. */
+ if (!q->p_params.dq_rate_estimator)
+ pie_set_enqueue_time(skb);
+
+ pkt_len = qdisc_pkt_len(skb);
+ q->stats.packets_in++;
+ q->memory_usage += skb->truesize;
+ sch->qstats.backlog += pkt_len;
+ sch->q.qlen++;
+ flow_queue_add(sel_flow, skb);
+ if (list_empty(&sel_flow->flowchain)) {
+ list_add_tail(&sel_flow->flowchain, &q->new_flows);
+ q->new_flow_count++;
+ sel_flow->deficit = q->quantum;
+ sel_flow->qlen = 0;
+ sel_flow->backlog = 0;
+ }
+ sel_flow->qlen++;
+ sel_flow->backlog += pkt_len;
+ return NET_XMIT_SUCCESS;
+ }
+out:
+ q->stats.dropped++;
+ sel_flow->vars.accu_prob = 0;
+ sel_flow->vars.accu_prob_overflows = 0;
+ __qdisc_drop(skb, to_free);
+ qdisc_qstats_drop(sch);
+ return NET_XMIT_CN;
+}
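/* [Editor's sketch -- not part of this patch.]  The ECN branch above marks a
 * packet instead of dropping it while the current drop probability is at most
 * ecn_prob percent of the fixed-point scale (MAX_PROB).  Quick check of that
 * comparison with illustrative probabilities and the default ecn_prob of 10:
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_MAX_PROB UINT64_MAX	/* mirrors the PIE probability scale */

int main(void)
{
	uint64_t ecn_prob = 10;				/* percent */
	uint64_t threshold = (TOY_MAX_PROB / 100) * ecn_prob;
	uint64_t prob_low = TOY_MAX_PROB / 20;		/* ~5% drop probability */
	uint64_t prob_high = TOY_MAX_PROB / 5;		/* ~20% drop probability */

	printf("~5%% drop probability:  %s\n",
	       prob_low <= threshold ? "CE mark" : "drop");
	printf("~20%% drop probability: %s\n",
	       prob_high <= threshold ? "CE mark" : "drop");
	return 0;
}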
+
+static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = {
+ [TCA_FQ_PIE_LIMIT] = {.type = NLA_U32},
+ [TCA_FQ_PIE_FLOWS] = {.type = NLA_U32},
+ [TCA_FQ_PIE_TARGET] = {.type = NLA_U32},
+ [TCA_FQ_PIE_TUPDATE] = {.type = NLA_U32},
+ [TCA_FQ_PIE_ALPHA] = {.type = NLA_U32},
+ [TCA_FQ_PIE_BETA] = {.type = NLA_U32},
+ [TCA_FQ_PIE_QUANTUM] = {.type = NLA_U32},
+ [TCA_FQ_PIE_MEMORY_LIMIT] = {.type = NLA_U32},
+ [TCA_FQ_PIE_ECN_PROB] = {.type = NLA_U32},
+ [TCA_FQ_PIE_ECN] = {.type = NLA_U32},
+ [TCA_FQ_PIE_BYTEMODE] = {.type = NLA_U32},
+ [TCA_FQ_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32},
+};
+
+static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow)
+{
+ struct sk_buff *skb = flow->head;
+
+ flow->head = skb->next;
+ skb->next = NULL;
+ return skb;
+}
+
+static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = NULL;
+ struct fq_pie_flow *flow;
+ struct list_head *head;
+ u32 pkt_len;
+
+begin:
+ head = &q->new_flows;
+ if (list_empty(head)) {
+ head = &q->old_flows;
+ if (list_empty(head))
+ return NULL;
+ }
+
+ flow = list_first_entry(head, struct fq_pie_flow, flowchain);
+ /* Flow has exhausted all its credits */
+ if (flow->deficit <= 0) {
+ flow->deficit += q->quantum;
+ list_move_tail(&flow->flowchain, &q->old_flows);
+ goto begin;
+ }
+
+ if (flow->head) {
+ skb = dequeue_head(flow);
+ pkt_len = qdisc_pkt_len(skb);
+ sch->qstats.backlog -= pkt_len;
+ sch->q.qlen--;
+ qdisc_bstats_update(sch, skb);
+ }
+
+ if (!skb) {
+ /* force a pass through old_flows to prevent starvation */
+ if (head == &q->new_flows && !list_empty(&q->old_flows))
+ list_move_tail(&flow->flowchain, &q->old_flows);
+ else
+ list_del_init(&flow->flowchain);
+ goto begin;
+ }
+
+ flow->qlen--;
+ flow->deficit -= pkt_len;
+ flow->backlog -= pkt_len;
+ q->memory_usage -= get_pie_cb(skb)->mem_usage;
+ pie_process_dequeue(skb, &q->p_params, &flow->vars, flow->backlog);
+ return skb;
+}
+
+static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_FQ_PIE_MAX + 1];
+ unsigned int len_dropped = 0;
+ unsigned int num_dropped = 0;
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack);
+ if (err < 0)
+ return err;
+
+ sch_tree_lock(sch);
+ if (tb[TCA_FQ_PIE_LIMIT]) {
+ u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]);
+
+ q->p_params.limit = limit;
+ sch->limit = limit;
+ }
+ if (tb[TCA_FQ_PIE_FLOWS]) {
+ if (q->flows) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Number of flows cannot be changed");
+ goto flow_error;
+ }
+ q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]);
+ if (!q->flows_cnt || q->flows_cnt > 65536) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Number of flows must be < 65536");
+ goto flow_error;
+ }
+ }
+
+ /* convert from microseconds to pschedtime */
+ if (tb[TCA_FQ_PIE_TARGET]) {
+ /* target is in us */
+ u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]);
+
+ /* convert to pschedtime */
+ q->p_params.target =
+ PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
+ }
+
+ /* tupdate is in jiffies */
+ if (tb[TCA_FQ_PIE_TUPDATE])
+ q->p_params.tupdate =
+ usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE]));
+
+ if (tb[TCA_FQ_PIE_ALPHA])
+ q->p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]);
+
+ if (tb[TCA_FQ_PIE_BETA])
+ q->p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]);
+
+ if (tb[TCA_FQ_PIE_QUANTUM])
+ q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]);
+
+ if (tb[TCA_FQ_PIE_MEMORY_LIMIT])
+ q->memory_limit = nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]);
+
+ if (tb[TCA_FQ_PIE_ECN_PROB])
+ q->ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]);
+
+ if (tb[TCA_FQ_PIE_ECN])
+ q->p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]);
+
+ if (tb[TCA_FQ_PIE_BYTEMODE])
+ q->p_params.bytemode = nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]);
+
+ if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR])
+ q->p_params.dq_rate_estimator =
+ nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]);
+
+ /* Drop excess packets if new limit is lower */
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = fq_pie_qdisc_dequeue(sch);
+
+ len_dropped += qdisc_pkt_len(skb);
+ num_dropped += 1;
+ rtnl_kfree_skbs(skb, skb);
+ }
+ qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped);
+
+ sch_tree_unlock(sch);
+ return 0;
+
+flow_error:
+ sch_tree_unlock(sch);
+ return -EINVAL;
+}
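/* [Editor's sketch -- not part of this patch.]  TCA_FQ_PIE_TARGET arrives in
 * microseconds and is stored in psched ticks as done above, while
 * TCA_FQ_PIE_TUPDATE becomes jiffies.  The arithmetic below assumes
 * PSCHED_SHIFT == 6, i.e. a 64 ns scheduler tick -- verify against
 * include/net/pkt_sched.h in your tree.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_NSEC_PER_USEC	1000ULL
#define TOY_PSCHED_SHIFT	6
#define TOY_PSCHED_NS2TICKS(x)	((x) >> TOY_PSCHED_SHIFT)

int main(void)
{
	uint64_t target_us = 15000;	/* PIE's default 15 ms target delay */
	uint64_t ticks = TOY_PSCHED_NS2TICKS(target_us * TOY_NSEC_PER_USEC);

	printf("%llu us -> %llu psched ticks\n",
	       (unsigned long long)target_us, (unsigned long long)ticks);
	return 0;
}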
+
+static void fq_pie_timer(struct timer_list *t)
+{
+ struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer);
+ struct Qdisc *sch = q->sch;
+ spinlock_t *root_lock; /* to lock qdisc for probability calculations */
+ u16 idx;
+
+ root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+ spin_lock(root_lock);
+
+ for (idx = 0; idx < q->flows_cnt; idx++)
+ pie_calculate_probability(&q->p_params, &q->flows[idx].vars,
+ q->flows[idx].backlog);
+
+ /* reset the timer to fire after 'tupdate' jiffies. */
+ if (q->p_params.tupdate)
+ mod_timer(&q->adapt_timer, jiffies + q->p_params.tupdate);
+
+ spin_unlock(root_lock);
+}
+
+static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ int err;
+ u16 idx;
+
+ pie_params_init(&q->p_params);
+ sch->limit = 10 * 1024;
+ q->p_params.limit = sch->limit;
+ q->quantum = psched_mtu(qdisc_dev(sch));
+ q->sch = sch;
+ q->ecn_prob = 10;
+ q->flows_cnt = 1024;
+ q->memory_limit = SZ_32M;
+
+ INIT_LIST_HEAD(&q->new_flows);
+ INIT_LIST_HEAD(&q->old_flows);
+
+ if (opt) {
+ err = fq_pie_change(sch, opt, extack);
+
+ if (err)
+ return err;
+ }
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ goto init_failure;
+
+ q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_pie_flow),
+ GFP_KERNEL);
+ if (!q->flows) {
+ err = -ENOMEM;
+ goto init_failure;
+ }
+ for (idx = 0; idx < q->flows_cnt; idx++) {
+ struct fq_pie_flow *flow = q->flows + idx;
+
+ INIT_LIST_HEAD(&flow->flowchain);
+ pie_vars_init(&flow->vars);
+ }
+
+ timer_setup(&q->adapt_timer, fq_pie_timer, 0);
+ mod_timer(&q->adapt_timer, jiffies + HZ / 2);
+
+ return 0;
+
+init_failure:
+ q->flows_cnt = 0;
+
+ return err;
+}
+
+static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (!opts)
+ return -EMSGSIZE;
+
+ /* convert target from pschedtime to us */
+ if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) ||
+ nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->flows_cnt) ||
+ nla_put_u32(skb, TCA_FQ_PIE_TARGET,
+ ((u32)PSCHED_TICKS2NS(q->p_params.target)) /
+ NSEC_PER_USEC) ||
+ nla_put_u32(skb, TCA_FQ_PIE_TUPDATE,
+ jiffies_to_usecs(q->p_params.tupdate)) ||
+ nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->p_params.alpha) ||
+ nla_put_u32(skb, TCA_FQ_PIE_BETA, q->p_params.beta) ||
+ nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) ||
+ nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, q->memory_limit) ||
+ nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->ecn_prob) ||
+ nla_put_u32(skb, TCA_FQ_PIE_ECN, q->p_params.ecn) ||
+ nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, q->p_params.bytemode) ||
+ nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR,
+ q->p_params.dq_rate_estimator))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct tc_fq_pie_xstats st = {
+ .packets_in = q->stats.packets_in,
+ .overlimit = q->stats.overlimit,
+ .overmemory = q->overmemory,
+ .dropped = q->stats.dropped,
+ .ecn_mark = q->stats.ecn_mark,
+ .new_flow_count = q->new_flow_count,
+ .memory_usage = q->memory_usage,
+ };
+ struct list_head *pos;
+
+ sch_tree_lock(sch);
+ list_for_each(pos, &q->new_flows)
+ st.new_flows_len++;
+
+ list_for_each(pos, &q->old_flows)
+ st.old_flows_len++;
+ sch_tree_unlock(sch);
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void fq_pie_reset(struct Qdisc *sch)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ u16 idx;
+
+ INIT_LIST_HEAD(&q->new_flows);
+ INIT_LIST_HEAD(&q->old_flows);
+ for (idx = 0; idx < q->flows_cnt; idx++) {
+ struct fq_pie_flow *flow = q->flows + idx;
+
+ /* Removes all packets from flow */
+ rtnl_kfree_skbs(flow->head, flow->tail);
+ flow->head = NULL;
+
+ INIT_LIST_HEAD(&flow->flowchain);
+ pie_vars_init(&flow->vars);
+ }
+
+ sch->q.qlen = 0;
+ sch->qstats.backlog = 0;
+}
+
+static void fq_pie_destroy(struct Qdisc *sch)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+
+ tcf_block_put(q->block);
+ del_timer_sync(&q->adapt_timer);
+ kvfree(q->flows);
+}
+
+static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = {
+ .id = "fq_pie",
+ .priv_size = sizeof(struct fq_pie_sched_data),
+ .enqueue = fq_pie_qdisc_enqueue,
+ .dequeue = fq_pie_qdisc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = fq_pie_init,
+ .destroy = fq_pie_destroy,
+ .reset = fq_pie_reset,
+ .change = fq_pie_change,
+ .dump = fq_pie_dump,
+ .dump_stats = fq_pie_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init fq_pie_module_init(void)
+{
+ return register_qdisc(&fq_pie_qdisc_ops);
+}
+
+static void __exit fq_pie_module_exit(void)
+{
+ unregister_qdisc(&fq_pie_qdisc_ops);
+}
+
+module_init(fq_pie_module_init);
+module_exit(fq_pie_module_exit);
+
+MODULE_DESCRIPTION("Flow Queue Proportional Integral controller Enhanced (FQ-PIE)");
+MODULE_AUTHOR("Mohit P. Tahiliani");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 11c03cf4aa74..6c9595f1048a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -46,6 +46,8 @@ EXPORT_SYMBOL(default_qdisc_ops);
* - updates to tree and tree walking are only done under the rtnl mutex.
*/
+#define SKB_XOFF_MAGIC ((struct sk_buff *)1UL)
+
static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
{
const struct netdev_queue *txq = q->dev_queue;
@@ -71,7 +73,7 @@ static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
q->q.qlen--;
}
} else {
- skb = NULL;
+ skb = SKB_XOFF_MAGIC;
}
}
@@ -253,8 +255,11 @@ validate:
return skb;
skb = qdisc_dequeue_skb_bad_txq(q);
- if (unlikely(skb))
+ if (unlikely(skb)) {
+ if (skb == SKB_XOFF_MAGIC)
+ return NULL;
goto bulk;
+ }
skb = q->dequeue(q);
if (skb) {
bulk:
@@ -377,13 +382,8 @@ void __qdisc_run(struct Qdisc *q)
int packets;
while (qdisc_restart(q, &packets)) {
- /*
- * Ordered by possible occurrence: Postpone processing if
- * 1. we've exceeded packet quota
- * 2. another process needs the CPU;
- */
quota -= packets;
- if (quota <= 0 || need_resched()) {
+ if (quota <= 0) {
__netif_schedule(q);
break;
}
@@ -441,7 +441,7 @@ static void dev_watchdog(struct timer_list *t)
trace_net_dev_xmit_timeout(dev, i);
WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
dev->name, netdev_drivername(dev), i);
- dev->netdev_ops->ndo_tx_timeout(dev);
+ dev->netdev_ops->ndo_tx_timeout(dev, i);
}
if (!mod_timer(&dev->watchdog_timer,
round_jiffies(jiffies +
@@ -624,8 +624,12 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
err = skb_array_produce(q, skb);
- if (unlikely(err))
- return qdisc_drop_cpu(skb, qdisc, to_free);
+ if (unlikely(err)) {
+ if (qdisc_is_percpu_stats(qdisc))
+ return qdisc_drop_cpu(skb, qdisc, to_free);
+ else
+ return qdisc_drop(skb, qdisc, to_free);
+ }
qdisc_update_stats_at_enqueue(qdisc, pkt_len);
return NET_XMIT_SUCCESS;
@@ -648,7 +652,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
if (likely(skb)) {
qdisc_update_stats_at_dequeue(qdisc, skb);
} else {
- qdisc->empty = true;
+ WRITE_ONCE(qdisc->empty, true);
}
return skb;
@@ -688,11 +692,14 @@ static void pfifo_fast_reset(struct Qdisc *qdisc)
kfree_skb(skb);
}
- for_each_possible_cpu(i) {
- struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i);
+ if (qdisc_is_percpu_stats(qdisc)) {
+ for_each_possible_cpu(i) {
+ struct gnet_stats_queue *q;
- q->backlog = 0;
- q->qlen = 0;
+ q = per_cpu_ptr(qdisc->cpu_qstats, i);
+ q->backlog = 0;
+ q->qlen = 0;
+ }
}
}
@@ -787,9 +794,6 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
};
EXPORT_SYMBOL(pfifo_fast_ops);
-static struct lock_class_key qdisc_tx_busylock;
-static struct lock_class_key qdisc_running_key;
-
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
const struct Qdisc_ops *ops,
struct netlink_ext_ack *extack)
@@ -842,17 +846,9 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
}
spin_lock_init(&sch->busylock);
- lockdep_set_class(&sch->busylock,
- dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
-
/* seqlock has the same scope of busylock, for NOLOCK qdisc */
spin_lock_init(&sch->seqlock);
- lockdep_set_class(&sch->busylock,
- dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
-
seqcount_init(&sch->running);
- lockdep_set_class(&sch->running,
- dev->qdisc_running_key ?: &qdisc_running_key);
sch->ops = ops;
sch->flags = ops->static_flags;
@@ -863,6 +859,12 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
dev_hold(dev);
refcount_set(&sch->refcnt, 1);
+ if (sch != &noop_qdisc) {
+ lockdep_set_class(&sch->busylock, &dev->qdisc_tx_busylock_key);
+ lockdep_set_class(&sch->seqlock, &dev->qdisc_tx_busylock_key);
+ lockdep_set_class(&sch->running, &dev->qdisc_running_key);
+ }
+
return sch;
errout1:
kfree(p);
@@ -973,6 +975,9 @@ static void qdisc_destroy(struct Qdisc *qdisc)
void qdisc_put(struct Qdisc *qdisc)
{
+ if (!qdisc)
+ return;
+
if (qdisc->flags & TCQ_F_BUILTIN ||
!refcount_dec_and_test(&qdisc->refcnt))
return;
@@ -1028,6 +1033,8 @@ static void attach_one_default_qdisc(struct net_device *dev,
if (dev->priv_flags & IFF_NO_QUEUE)
ops = &noqueue_qdisc_ops;
+ else if (dev->type == ARPHRD_CAN)
+ ops = &pfifo_fast_ops;
qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
if (!qdisc) {
@@ -1202,8 +1209,13 @@ void dev_deactivate_many(struct list_head *head)
/* Wait for outstanding qdisc_run calls. */
list_for_each_entry(dev, head, close_list) {
- while (some_qdisc_is_busy(dev))
- yield();
+ while (some_qdisc_is_busy(dev)) {
+ /* wait_event() would avoid this sleep-loop but would
+ * require expensive checks in the fast paths of packet
+ * processing which isn't worth it.
+ */
+ schedule_timeout_uninterruptible(1);
+ }
/* The new qdisc is assigned at this point so we can safely
* unwind stale skb lists and qdisc statistics
*/
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index cee6971c1c82..be35f03b657b 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -5,11 +5,11 @@
* Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com>
*/
-#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
+#include <linux/siphash.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
@@ -126,7 +126,7 @@ struct wdrr_bucket {
struct hhf_sched_data {
struct wdrr_bucket buckets[WDRR_BUCKET_CNT];
- u32 perturbation; /* hash perturbation */
+ siphash_key_t perturbation; /* hash perturbation */
u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
u32 drop_overlimit; /* number of times max qdisc packet
* limit was hit
@@ -264,7 +264,7 @@ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch)
}
/* Get hashed flow-id of the skb. */
- hash = skb_get_hash_perturb(skb, q->perturbation);
+ hash = skb_get_hash_perturb(skb, &q->perturbation);
/* Check if this packet belongs to an already established HH flow. */
flow_pos = hash & HHF_BIT_MASK;
@@ -531,7 +531,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt,
new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]);
non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight;
- if (non_hh_quantum > INT_MAX)
+ if (non_hh_quantum == 0 || non_hh_quantum > INT_MAX)
return -EINVAL;
sch_tree_lock(sch);
@@ -582,7 +582,7 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt,
sch->limit = 1000;
q->quantum = psched_mtu(qdisc_dev(sch));
- q->perturbation = prandom_u32();
+ get_random_bytes(&q->perturbation, sizeof(q->perturbation));
INIT_LIST_HEAD(&q->new_buckets);
INIT_LIST_HEAD(&q->old_buckets);
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 7bcf20ef9145..8184c87da8be 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1302,6 +1302,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
struct htb_class *cl = (struct htb_class *)*arg, *parent;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_HTB_MAX + 1];
+ struct Qdisc *parent_qdisc = NULL;
struct tc_htb_opt *hopt;
u64 rate64, ceil64;
int warn = 0;
@@ -1401,7 +1402,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
if (parent && !parent->level) {
/* turn parent into inner node */
qdisc_purge_queue(parent->leaf.q);
- qdisc_put(parent->leaf.q);
+ parent_qdisc = parent->leaf.q;
if (parent->prio_activity)
htb_deactivate(q, parent);
@@ -1480,6 +1481,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
sch_tree_unlock(sch);
+ qdisc_put(parent_qdisc);
if (warn)
pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n",
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 0d578333e967..e79f1afe0cfd 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -153,6 +153,7 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
__gnet_stats_copy_queue(&sch->qstats,
qdisc->cpu_qstats,
&qdisc->qstats, qlen);
+ sch->q.qlen += qlen;
} else {
sch->q.qlen += qdisc->q.qlen;
sch->bstats.bytes += qdisc->bstats.bytes;
@@ -245,7 +246,8 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
sch = dev_queue->qdisc_sleeping;
- if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 ||
+ if (gnet_stats_copy_basic(&sch->running, d, sch->cpu_bstats,
+ &sch->bstats) < 0 ||
qdisc_qstats_copy(d, sch) < 0)
return -1;
return 0;
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 46980b8d66c5..8766ab5b8788 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -411,6 +411,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
__gnet_stats_copy_queue(&sch->qstats,
qdisc->cpu_qstats,
&qdisc->qstats, qlen);
+ sch->q.qlen += qlen;
} else {
sch->q.qlen += qdisc->q.qlen;
sch->bstats.bytes += qdisc->bstats.bytes;
@@ -433,7 +434,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
opt.offset[tc] = dev->tc_to_txq[tc].offset;
}
- if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt))
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
goto nla_put_failure;
if ((priv->flags & TC_MQPRIO_F_MODE) &&
@@ -557,8 +558,8 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
sch = dev_queue->qdisc_sleeping;
- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
- d, NULL, &sch->bstats) < 0 ||
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d,
+ sch->cpu_bstats, &sch->bstats) < 0 ||
qdisc_qstats_copy(d, sch) < 0)
return -1;
}
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index e1087746f6a2..1330ad224931 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -174,7 +174,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
{
struct multiq_sched_data *q = qdisc_priv(sch);
struct tc_multiq_qopt *qopt;
- int i;
+ struct Qdisc **removed;
+ int i, n_removed = 0;
if (!netif_is_multiqueue(qdisc_dev(sch)))
return -EOPNOTSUPP;
@@ -185,6 +186,11 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
qopt->bands = qdisc_dev(sch)->real_num_tx_queues;
+ removed = kmalloc(sizeof(*removed) * (q->max_bands - q->bands),
+ GFP_KERNEL);
+ if (!removed)
+ return -ENOMEM;
+
sch_tree_lock(sch);
q->bands = qopt->bands;
for (i = q->bands; i < q->max_bands; i++) {
@@ -192,13 +198,17 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
struct Qdisc *child = q->queues[i];
q->queues[i] = &noop_qdisc;
- qdisc_tree_flush_backlog(child);
- qdisc_put(child);
+ qdisc_purge_queue(child);
+ removed[n_removed++] = child;
}
}
sch_tree_unlock(sch);
+ for (i = 0; i < n_removed; i++)
+ qdisc_put(removed[i]);
+ kfree(removed);
+
for (i = 0; i < q->bands; i++) {
if (q->queues[i] == &noop_qdisc) {
struct Qdisc *child, *old;
@@ -213,11 +223,10 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
if (child != &noop_qdisc)
qdisc_hash_add(child, true);
- if (old != &noop_qdisc) {
- qdisc_tree_flush_backlog(old);
- qdisc_put(old);
- }
+ if (old != &noop_qdisc)
+ qdisc_purge_queue(old);
sch_tree_unlock(sch);
+ qdisc_put(old);
}
}
}
@@ -330,7 +339,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
cl_q = q->queues[cl - 1];
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
- d, NULL, &cl_q->bstats) < 0 ||
+ d, cl_q->cpu_bstats, &cl_q->bstats) < 0 ||
qdisc_qstats_copy(d, cl_q) < 0)
return -1;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index b17f2ed970e2..42e557d48e4e 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -476,7 +476,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
* skb will be queued.
*/
if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
- struct Qdisc *rootq = qdisc_root(sch);
+ struct Qdisc *rootq = qdisc_root_bh(sch);
u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
q->duplicate = 0;
@@ -509,6 +509,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (skb->ip_summed == CHECKSUM_PARTIAL &&
skb_checksum_help(skb)) {
qdisc_drop(skb, sch, to_free);
+ skb = NULL;
goto finish_segs;
}
@@ -593,9 +594,10 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
finish_segs:
if (segs) {
unsigned int len, last_len;
- int nb = 0;
+ int nb;
- len = skb->len;
+ len = skb ? skb->len : 0;
+ nb = skb ? 1 : 0;
while (segs) {
skb2 = segs->next;
@@ -612,7 +614,10 @@ finish_segs:
}
segs = skb2;
}
- qdisc_tree_reduce_backlog(sch, -nb, prev_len - len);
+ /* Parent qdiscs accounted for 1 skb of size @prev_len */
+ qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
+ } else if (!skb) {
+ return NET_XMIT_DROP;
}
return NET_XMIT_SUCCESS;
}
@@ -777,7 +782,7 @@ static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
struct disttable *d;
int i;
- if (n > NETEM_DIST_MAX)
+ if (!n || n > NETEM_DIST_MAX)
return -EINVAL;
d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index df98a887eb89..915bcdb59a9f 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -19,134 +19,76 @@
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
-
-#define QUEUE_THRESHOLD 16384
-#define DQCOUNT_INVALID -1
-#define MAX_PROB 0xffffffffffffffff
-#define PIE_SCALE 8
-
-/* parameters used */
-struct pie_params {
- psched_time_t target; /* user specified target delay in pschedtime */
- u32 tupdate; /* timer frequency (in jiffies) */
- u32 limit; /* number of packets that can be enqueued */
- u32 alpha; /* alpha and beta are between 0 and 32 */
- u32 beta; /* and are used for shift relative to 1 */
- bool ecn; /* true if ecn is enabled */
- bool bytemode; /* to scale drop early prob based on pkt size */
-};
-
-/* variables used */
-struct pie_vars {
- u64 prob; /* probability but scaled by u64 limit. */
- psched_time_t burst_time;
- psched_time_t qdelay;
- psched_time_t qdelay_old;
- u64 dq_count; /* measured in bytes */
- psched_time_t dq_tstamp; /* drain rate */
- u64 accu_prob; /* accumulated drop probability */
- u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */
- u32 qlen_old; /* in bytes */
- u8 accu_prob_overflows; /* overflows of accu_prob */
-};
-
-/* statistics gathering */
-struct pie_stats {
- u32 packets_in; /* total number of packets enqueued */
- u32 dropped; /* packets dropped due to pie_action */
- u32 overlimit; /* dropped due to lack of space in queue */
- u32 maxq; /* maximum queue size */
- u32 ecn_mark; /* packets marked with ECN */
-};
+#include <net/pie.h>
/* private data for the Qdisc */
struct pie_sched_data {
- struct pie_params params;
struct pie_vars vars;
+ struct pie_params params;
struct pie_stats stats;
struct timer_list adapt_timer;
struct Qdisc *sch;
};
-static void pie_params_init(struct pie_params *params)
-{
- params->alpha = 2;
- params->beta = 20;
- params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */
- params->limit = 1000; /* default of 1000 packets */
- params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */
- params->ecn = false;
- params->bytemode = false;
-}
-
-static void pie_vars_init(struct pie_vars *vars)
+bool pie_drop_early(struct Qdisc *sch, struct pie_params *params,
+ struct pie_vars *vars, u32 qlen, u32 packet_size)
{
- vars->dq_count = DQCOUNT_INVALID;
- vars->accu_prob = 0;
- vars->avg_dq_rate = 0;
- /* default of 150 ms in pschedtime */
- vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC);
- vars->accu_prob_overflows = 0;
-}
-
-static bool drop_early(struct Qdisc *sch, u32 packet_size)
-{
- struct pie_sched_data *q = qdisc_priv(sch);
u64 rnd;
- u64 local_prob = q->vars.prob;
+ u64 local_prob = vars->prob;
u32 mtu = psched_mtu(qdisc_dev(sch));
/* If there is still burst allowance left skip random early drop */
- if (q->vars.burst_time > 0)
+ if (vars->burst_time > 0)
return false;
/* If current delay is less than half of target, and
* if drop prob is low already, disable early_drop
*/
- if ((q->vars.qdelay < q->params.target / 2) &&
- (q->vars.prob < MAX_PROB / 5))
+ if ((vars->qdelay < params->target / 2) &&
+ (vars->prob < MAX_PROB / 5))
return false;
- /* If we have fewer than 2 mtu-sized packets, disable drop_early,
+ /* If we have fewer than 2 mtu-sized packets, disable pie_drop_early,
* similar to min_th in RED
*/
- if (sch->qstats.backlog < 2 * mtu)
+ if (qlen < 2 * mtu)
return false;
/* If bytemode is turned on, use packet size to compute new
* probablity. Smaller packets will have lower drop prob in this case
*/
- if (q->params.bytemode && packet_size <= mtu)
+ if (params->bytemode && packet_size <= mtu)
local_prob = (u64)packet_size * div_u64(local_prob, mtu);
else
- local_prob = q->vars.prob;
+ local_prob = vars->prob;
if (local_prob == 0) {
- q->vars.accu_prob = 0;
- q->vars.accu_prob_overflows = 0;
+ vars->accu_prob = 0;
+ vars->accu_prob_overflows = 0;
}
- if (local_prob > MAX_PROB - q->vars.accu_prob)
- q->vars.accu_prob_overflows++;
+ if (local_prob > MAX_PROB - vars->accu_prob)
+ vars->accu_prob_overflows++;
- q->vars.accu_prob += local_prob;
+ vars->accu_prob += local_prob;
- if (q->vars.accu_prob_overflows == 0 &&
- q->vars.accu_prob < (MAX_PROB / 100) * 85)
+ if (vars->accu_prob_overflows == 0 &&
+ vars->accu_prob < (MAX_PROB / 100) * 85)
return false;
- if (q->vars.accu_prob_overflows == 8 &&
- q->vars.accu_prob >= MAX_PROB / 2)
+ if (vars->accu_prob_overflows == 8 &&
+ vars->accu_prob >= MAX_PROB / 2)
return true;
prandom_bytes(&rnd, 8);
if (rnd < local_prob) {
- q->vars.accu_prob = 0;
- q->vars.accu_prob_overflows = 0;
+ vars->accu_prob = 0;
+ vars->accu_prob_overflows = 0;
return true;
}
return false;
}
+EXPORT_SYMBOL_GPL(pie_drop_early);
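/* [Editor's sketch -- not part of this patch.]  Stripped of the guard
 * conditions (burst allowance, low delay, short queue) and the accu_prob
 * bookkeeping, pie_drop_early() boils down to: optionally scale the
 * probability by packet_size/MTU in bytemode, then drop if a 64-bit random
 * draw falls below it.  The random source and MTU below are placeholders.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_MTU 1500u

static uint64_t toy_rnd64(void)
{
	/* xorshift64 stand-in for the kernel's prandom_bytes() */
	static uint64_t s = 0x9e3779b97f4a7c15ULL;

	s ^= s << 13;
	s ^= s >> 7;
	s ^= s << 17;
	return s;
}

static bool toy_drop_early(uint64_t prob, uint32_t packet_size, bool bytemode)
{
	uint64_t local_prob = prob;

	if (bytemode && packet_size <= TOY_MTU)
		local_prob = (uint64_t)packet_size * (prob / TOY_MTU);

	return toy_rnd64() < local_prob;
}

int main(void)
{
	uint64_t prob = UINT64_MAX / 10;	/* roughly 10% drop probability */
	unsigned int drops = 0, trials = 100000, i;

	for (i = 0; i < trials; i++)
		drops += toy_drop_early(prob, 1000, false);
	printf("dropped %u of %u packets\n", drops, trials);
	return 0;
}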
static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
@@ -159,7 +101,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
goto out;
}
- if (!drop_early(sch, skb->len)) {
+ if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog,
+ skb->len)) {
enqueue = true;
} else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) &&
INET_ECN_set_ce(skb)) {
@@ -172,6 +115,10 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* we can enqueue the packet */
if (enqueue) {
+ /* Set enqueue time only when dq_rate_estimator is disabled. */
+ if (!q->params.dq_rate_estimator)
+ pie_set_enqueue_time(skb);
+
q->stats.packets_in++;
if (qdisc_qlen(sch) > q->stats.maxq)
q->stats.maxq = qdisc_qlen(sch);
@@ -187,13 +134,14 @@ out:
}
static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
- [TCA_PIE_TARGET] = {.type = NLA_U32},
- [TCA_PIE_LIMIT] = {.type = NLA_U32},
- [TCA_PIE_TUPDATE] = {.type = NLA_U32},
- [TCA_PIE_ALPHA] = {.type = NLA_U32},
- [TCA_PIE_BETA] = {.type = NLA_U32},
- [TCA_PIE_ECN] = {.type = NLA_U32},
- [TCA_PIE_BYTEMODE] = {.type = NLA_U32},
+ [TCA_PIE_TARGET] = {.type = NLA_U32},
+ [TCA_PIE_LIMIT] = {.type = NLA_U32},
+ [TCA_PIE_TUPDATE] = {.type = NLA_U32},
+ [TCA_PIE_ALPHA] = {.type = NLA_U32},
+ [TCA_PIE_BETA] = {.type = NLA_U32},
+ [TCA_PIE_ECN] = {.type = NLA_U32},
+ [TCA_PIE_BYTEMODE] = {.type = NLA_U32},
+ [TCA_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32},
};
static int pie_change(struct Qdisc *sch, struct nlattr *opt,
@@ -247,6 +195,10 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_PIE_BYTEMODE])
q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]);
+ if (tb[TCA_PIE_DQ_RATE_ESTIMATOR])
+ q->params.dq_rate_estimator =
+ nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR]);
+
/* Drop excess packets if new limit is lower */
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
@@ -262,48 +214,69 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt,
return 0;
}
-static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
+void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params,
+ struct pie_vars *vars, u32 qlen)
{
- struct pie_sched_data *q = qdisc_priv(sch);
- int qlen = sch->qstats.backlog; /* current queue size in bytes */
+ psched_time_t now = psched_get_time();
+ u32 dtime = 0;
+
+ /* If dq_rate_estimator is disabled, calculate qdelay using the
+ * packet timestamp.
+ */
+ if (!params->dq_rate_estimator) {
+ vars->qdelay = now - pie_get_enqueue_time(skb);
+
+ if (vars->dq_tstamp != DTIME_INVALID)
+ dtime = now - vars->dq_tstamp;
+
+ vars->dq_tstamp = now;
+
+ if (qlen == 0)
+ vars->qdelay = 0;
+
+ if (dtime == 0)
+ return;
+
+ goto burst_allowance_reduction;
+ }
/* If current queue is about 10 packets or more and dq_count is unset
* we have enough packets to calculate the drain rate. Save
* current time as dq_tstamp and start measurement cycle.
*/
- if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) {
- q->vars.dq_tstamp = psched_get_time();
- q->vars.dq_count = 0;
+ if (qlen >= QUEUE_THRESHOLD && vars->dq_count == DQCOUNT_INVALID) {
+ vars->dq_tstamp = psched_get_time();
+ vars->dq_count = 0;
}
- /* Calculate the average drain rate from this value. If queue length
- * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset
+ /* Calculate the average drain rate from this value. If queue length
+ * has receded to a small value viz., <= QUEUE_THRESHOLD bytes, reset
* the dq_count to -1 as we don't have enough packets to calculate the
- * drain rate anymore The following if block is entered only when we
+ * drain rate anymore. The following if block is entered only when we
* have a substantial queue built up (QUEUE_THRESHOLD bytes or more)
* and we calculate the drain rate for the threshold here. dq_count is
* in bytes, time difference in psched_time, hence rate is in
* bytes/psched_time.
*/
- if (q->vars.dq_count != DQCOUNT_INVALID) {
- q->vars.dq_count += skb->len;
+ if (vars->dq_count != DQCOUNT_INVALID) {
+ vars->dq_count += skb->len;
+
+ if (vars->dq_count >= QUEUE_THRESHOLD) {
+ u32 count = vars->dq_count << PIE_SCALE;
- if (q->vars.dq_count >= QUEUE_THRESHOLD) {
- psched_time_t now = psched_get_time();
- u32 dtime = now - q->vars.dq_tstamp;
- u32 count = q->vars.dq_count << PIE_SCALE;
+ dtime = now - vars->dq_tstamp;
if (dtime == 0)
return;
count = count / dtime;
- if (q->vars.avg_dq_rate == 0)
- q->vars.avg_dq_rate = count;
+ if (vars->avg_dq_rate == 0)
+ vars->avg_dq_rate = count;
else
- q->vars.avg_dq_rate =
- (q->vars.avg_dq_rate -
- (q->vars.avg_dq_rate >> 3)) + (count >> 3);
+ vars->avg_dq_rate =
+ (vars->avg_dq_rate -
+ (vars->avg_dq_rate >> 3)) + (count >> 3);
/* If the queue has receded below the threshold, we hold
* on to the last drain rate calculated, else we reset
@@ -311,43 +284,54 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
* packet is dequeued
*/
if (qlen < QUEUE_THRESHOLD) {
- q->vars.dq_count = DQCOUNT_INVALID;
+ vars->dq_count = DQCOUNT_INVALID;
} else {
- q->vars.dq_count = 0;
- q->vars.dq_tstamp = psched_get_time();
+ vars->dq_count = 0;
+ vars->dq_tstamp = psched_get_time();
}
- if (q->vars.burst_time > 0) {
- if (q->vars.burst_time > dtime)
- q->vars.burst_time -= dtime;
- else
- q->vars.burst_time = 0;
- }
+ goto burst_allowance_reduction;
}
}
+
+ return;
+
+burst_allowance_reduction:
+ if (vars->burst_time > 0) {
+ if (vars->burst_time > dtime)
+ vars->burst_time -= dtime;
+ else
+ vars->burst_time = 0;
+ }
}
+EXPORT_SYMBOL_GPL(pie_process_dequeue);
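
For reference, the drain-rate estimate kept in avg_dq_rate above is a 1/8-weighted exponential moving average: each new rate sample contributes one eighth of its value while seven eighths of the previous average are retained. A minimal userspace model of just that update (the PIE_SCALE fixed-point shift and psched time units are deliberately left out):

#include <stdio.h>
#include <stdint.h>

/* Model of the avg_dq_rate update in pie_process_dequeue():
 * new_avg = old_avg - old_avg/8 + sample/8, i.e. an EWMA with weight 1/8.
 * The first sample simply seeds the average, as in the kernel code.
 */
static uint32_t ewma_update(uint32_t avg, uint32_t sample)
{
	if (avg == 0)
		return sample;
	return (avg - (avg >> 3)) + (sample >> 3);
}

int main(void)
{
	uint32_t samples[] = { 1000, 1200, 800, 1000, 4000 };
	uint32_t avg = 0;

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		avg = ewma_update(avg, samples[i]);
		printf("sample %u -> avg %u\n", (unsigned)samples[i],
		       (unsigned)avg);
	}
	return 0;
}

Feeding a few samples through it shows how the estimate converges quickly while still smoothing out a single outlier.
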
-static void calculate_probability(struct Qdisc *sch)
+void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
+ u32 qlen)
{
- struct pie_sched_data *q = qdisc_priv(sch);
- u32 qlen = sch->qstats.backlog; /* queue size in bytes */
psched_time_t qdelay = 0; /* in pschedtime */
- psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */
+ psched_time_t qdelay_old = 0; /* in pschedtime */
s64 delta = 0; /* determines the change in probability */
u64 oldprob;
u64 alpha, beta;
u32 power;
bool update_prob = true;
- q->vars.qdelay_old = q->vars.qdelay;
+ if (params->dq_rate_estimator) {
+ qdelay_old = vars->qdelay;
+ vars->qdelay_old = vars->qdelay;
- if (q->vars.avg_dq_rate > 0)
- qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate;
- else
- qdelay = 0;
+ if (vars->avg_dq_rate > 0)
+ qdelay = (qlen << PIE_SCALE) / vars->avg_dq_rate;
+ else
+ qdelay = 0;
+ } else {
+ qdelay = vars->qdelay;
+ qdelay_old = vars->qdelay_old;
+ }
- /* If qdelay is zero and qlen is not, it means qlen is very small, less
- * than dequeue_rate, so we do not update probabilty in this round
+ /* If qdelay is zero and qlen is not, it means qlen is very small,
+ * so we do not update probability in this round.
*/
if (qdelay == 0 && qlen != 0)
update_prob = false;
@@ -359,18 +343,18 @@ static void calculate_probability(struct Qdisc *sch)
* probability. alpha/beta are updated locally below by scaling down
* by 16 to come to 0-2 range.
*/
- alpha = ((u64)q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
- beta = ((u64)q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+ alpha = ((u64)params->alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+ beta = ((u64)params->beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
/* We scale alpha and beta differently depending on how heavy the
* congestion is. Please see RFC 8033 for details.
*/
- if (q->vars.prob < MAX_PROB / 10) {
+ if (vars->prob < MAX_PROB / 10) {
alpha >>= 1;
beta >>= 1;
power = 100;
- while (q->vars.prob < div_u64(MAX_PROB, power) &&
+ while (vars->prob < div_u64(MAX_PROB, power) &&
power <= 1000000) {
alpha >>= 2;
beta >>= 2;
@@ -379,14 +363,14 @@ static void calculate_probability(struct Qdisc *sch)
}
/* alpha and beta should be between 0 and 32, in multiples of 1/16 */
- delta += alpha * (u64)(qdelay - q->params.target);
+ delta += alpha * (u64)(qdelay - params->target);
delta += beta * (u64)(qdelay - qdelay_old);
- oldprob = q->vars.prob;
+ oldprob = vars->prob;
/* to ensure we increase probability in steps of no more than 2% */
if (delta > (s64)(MAX_PROB / (100 / 2)) &&
- q->vars.prob >= MAX_PROB / 10)
+ vars->prob >= MAX_PROB / 10)
delta = (MAX_PROB / 100) * 2;
/* Non-linear drop:
@@ -397,12 +381,12 @@ static void calculate_probability(struct Qdisc *sch)
if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC)))
delta += MAX_PROB / (100 / 2);
- q->vars.prob += delta;
+ vars->prob += delta;
if (delta > 0) {
/* prevent overflow */
- if (q->vars.prob < oldprob) {
- q->vars.prob = MAX_PROB;
+ if (vars->prob < oldprob) {
+ vars->prob = MAX_PROB;
/* Prevent normalization error. If probability is at
* maximum value already, we normalize it here, and
* skip the check to do a non-linear drop in the next
@@ -412,8 +396,8 @@ static void calculate_probability(struct Qdisc *sch)
}
} else {
/* prevent underflow */
- if (q->vars.prob > oldprob)
- q->vars.prob = 0;
+ if (vars->prob > oldprob)
+ vars->prob = 0;
}
/* Non-linear drop in probability: Reduce drop probability quickly if
@@ -422,23 +406,28 @@ static void calculate_probability(struct Qdisc *sch)
if (qdelay == 0 && qdelay_old == 0 && update_prob)
/* Reduce drop probability to 98.4% */
- q->vars.prob -= q->vars.prob / 64u;
+ vars->prob -= vars->prob / 64;
- q->vars.qdelay = qdelay;
- q->vars.qlen_old = qlen;
+ vars->qdelay = qdelay;
+ vars->qlen_old = qlen;
/* We restart the measurement cycle if the following conditions are met
* 1. If the delay has been low for 2 consecutive Tupdate periods
* 2. Calculated drop probability is zero
- * 3. We have atleast one estimate for the avg_dq_rate ie.,
- * is a non-zero value
+ * 3. If the dq_rate_estimator is enabled, we have at least one
+ * estimate for avg_dq_rate, i.e., it is a non-zero value
*/
- if ((q->vars.qdelay < q->params.target / 2) &&
- (q->vars.qdelay_old < q->params.target / 2) &&
- q->vars.prob == 0 &&
- q->vars.avg_dq_rate > 0)
- pie_vars_init(&q->vars);
+ if ((vars->qdelay < params->target / 2) &&
+ (vars->qdelay_old < params->target / 2) &&
+ vars->prob == 0 &&
+ (!params->dq_rate_estimator || vars->avg_dq_rate > 0)) {
+ pie_vars_init(vars);
+ }
+
+ if (!params->dq_rate_estimator)
+ vars->qdelay_old = qdelay;
}
+EXPORT_SYMBOL_GPL(pie_calculate_probability);
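
The update above is the proportional-integral step at the heart of PIE (RFC 8033): the drop probability moves by alpha times the gap between the current queuing delay and the target, plus beta times the change in delay since the previous update. A deliberately simplified userspace sketch of that controller, in floating point and without the kernel's fixed-point scaling, clamping or non-linear adjustments (the gains and delay values are illustrative, not the qdisc defaults):

#include <stdio.h>

/* Simplified PI step from pie_calculate_probability():
 * delta = alpha * (qdelay - target) + beta * (qdelay - qdelay_old),
 * with prob nudged by delta and clamped to [0, 1].
 */
int main(void)
{
	const double alpha = 0.125, beta = 1.25;	/* example gains */
	const double target = 15.0;			/* target delay, ms */
	double qdelay_samples[] = { 5, 20, 40, 30, 15, 10 };
	double prob = 0.0, qdelay_old = 0.0;

	for (int i = 0; i < 6; i++) {
		double qdelay = qdelay_samples[i];
		double delta = alpha * (qdelay - target) +
			       beta * (qdelay - qdelay_old);

		prob += delta / 100.0;	/* arbitrary scale for the demo */
		if (prob < 0.0)
			prob = 0.0;
		if (prob > 1.0)
			prob = 1.0;
		qdelay_old = qdelay;
		printf("qdelay=%5.1f ms -> drop prob %.3f\n", qdelay, prob);
	}
	return 0;
}
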
static void pie_timer(struct timer_list *t)
{
@@ -447,7 +436,7 @@ static void pie_timer(struct timer_list *t)
spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
spin_lock(root_lock);
- calculate_probability(sch);
+ pie_calculate_probability(&q->params, &q->vars, sch->qstats.backlog);
/* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */
if (q->params.tupdate)
@@ -497,7 +486,9 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) ||
nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) ||
nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) ||
- nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode))
+ nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode) ||
+ nla_put_u32(skb, TCA_PIE_DQ_RATE_ESTIMATOR,
+ q->params.dq_rate_estimator))
goto nla_put_failure;
return nla_nest_end(skb, opts);
@@ -514,9 +505,6 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
.prob = q->vars.prob,
.delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) /
NSEC_PER_USEC,
- /* unscale and return dq_rate in bytes per sec */
- .avg_dq_rate = q->vars.avg_dq_rate *
- (PSCHED_TICKS_PER_SEC) >> PIE_SCALE,
.packets_in = q->stats.packets_in,
.overlimit = q->stats.overlimit,
.maxq = q->stats.maxq,
@@ -524,17 +512,26 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
.ecn_mark = q->stats.ecn_mark,
};
+ /* avg_dq_rate is only valid if dq_rate_estimator is enabled */
+ st.dq_rate_estimating = q->params.dq_rate_estimator;
+
+ /* unscale and return dq_rate in bytes per sec */
+ if (q->params.dq_rate_estimator)
+ st.avg_dq_rate = q->vars.avg_dq_rate *
+ (PSCHED_TICKS_PER_SEC) >> PIE_SCALE;
+
return gnet_stats_copy_app(d, &st, sizeof(st));
}
static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
{
+ struct pie_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb = qdisc_dequeue_head(sch);
if (!skb)
return NULL;
- pie_process_dequeue(sch, skb);
+ pie_process_dequeue(skb, &q->params, &q->vars, sch->qstats.backlog);
return skb;
}
@@ -555,7 +552,7 @@ static void pie_destroy(struct Qdisc *sch)
}
static struct Qdisc_ops pie_qdisc_ops __read_mostly = {
- .id = "pie",
+ .id = "pie",
.priv_size = sizeof(struct pie_sched_data),
.enqueue = pie_qdisc_enqueue,
.dequeue = pie_qdisc_dequeue,
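
The net effect of this file's changes is that the PIE machinery now operates on caller-supplied struct pie_params / struct pie_vars pairs, with pie_drop_early(), pie_process_dequeue() and pie_calculate_probability() exported for reuse by other qdiscs (FQ-PIE keeps one pie_vars per flow). A hypothetical enqueue path built only on the calls visible in this patch; my_pie_enqueue() and the flow_backlog argument are placeholders, and the shared declarations are assumed to live in <net/pie.h>:

/* Hedged sketch of a qdisc reusing the exported PIE helpers per flow. */
static int my_pie_enqueue(struct Qdisc *sch, struct sk_buff *skb,
			  struct pie_params *params, struct pie_vars *vars,
			  u32 flow_backlog, struct sk_buff **to_free)
{
	/* Early-drop decision based on this flow's own PIE state. */
	if (pie_drop_early(sch, params, vars, flow_backlog, skb->len))
		return qdisc_drop(skb, sch, to_free);

	/* Timestamp-based delay measurement needs the enqueue time. */
	if (!params->dq_rate_estimator)
		pie_set_enqueue_time(skb);

	return qdisc_enqueue_tail(skb, sch);
}
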
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 0f8fedb8809a..647941702f9f 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -292,8 +292,14 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct tc_prio_qopt_offload graft_offload;
unsigned long band = arg - 1;
- if (new == NULL)
- new = &noop_qdisc;
+ if (!new) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ TC_H_MAKE(sch->handle, arg), extack);
+ if (!new)
+ new = &noop_qdisc;
+ else
+ qdisc_hash_add(new, true);
+ }
*old = qdisc_replace(sch, new, &q->queues[band]);
@@ -356,7 +362,7 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
cl_q = q->queues[cl - 1];
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
- d, NULL, &cl_q->bstats) < 0 ||
+ d, cl_q->cpu_bstats, &cl_q->bstats) < 0 ||
qdisc_qstats_copy(d, cl_q) < 0)
return -1;
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 1dff8506a715..4074c50ac3d7 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -18,7 +18,7 @@
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/random.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <net/ip.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
@@ -45,7 +45,7 @@ struct sfb_bucket {
* (Section 4.4 of SFB reference : moving hash functions)
*/
struct sfb_bins {
- u32 perturbation; /* jhash perturbation */
+ siphash_key_t perturbation; /* siphash key */
struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS];
};
@@ -217,7 +217,8 @@ static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_da
static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)
{
- q->bins[slot].perturbation = prandom_u32();
+ get_random_bytes(&q->bins[slot].perturbation,
+ sizeof(q->bins[slot].perturbation));
}
static void sfb_swap_slot(struct sfb_sched_data *q)
@@ -314,9 +315,9 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* If using external classifiers, get result and record it. */
if (!sfb_classify(skb, fl, &ret, &salt))
goto other_drop;
- sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
+ sfbhash = siphash_1u32(salt, &q->bins[slot].perturbation);
} else {
- sfbhash = skb_get_hash_perturb(skb, q->bins[slot].perturbation);
+ sfbhash = skb_get_hash_perturb(skb, &q->bins[slot].perturbation);
}
@@ -352,7 +353,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* Inelastic flow */
if (q->double_buffering) {
sfbhash = skb_get_hash_perturb(skb,
- q->bins[slot].perturbation);
+ &q->bins[slot].perturbation);
if (!sfbhash)
sfbhash = 1;
sfb_skb_cb(skb)->hashes[slot] = sfbhash;
@@ -488,7 +489,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct sfb_sched_data *q = qdisc_priv(sch);
- struct Qdisc *child;
+ struct Qdisc *child, *old;
struct nlattr *tb[TCA_SFB_MAX + 1];
const struct tc_sfb_qopt *ctl = &sfb_default_ops;
u32 limit;
@@ -518,8 +519,8 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt,
qdisc_hash_add(child, true);
sch_tree_lock(sch);
- qdisc_tree_flush_backlog(q->qdisc);
- qdisc_put(q->qdisc);
+ qdisc_purge_queue(q->qdisc);
+ old = q->qdisc;
q->qdisc = child;
q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
@@ -542,6 +543,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt,
sfb_init_perturbation(1, q);
sch_tree_unlock(sch);
+ qdisc_put(old);
return 0;
}
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 68404a9d2ce4..c787d4d46017 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -14,7 +14,7 @@
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
@@ -117,7 +117,7 @@ struct sfq_sched_data {
u8 headdrop;
u8 maxdepth; /* limit of packets per flow */
- u32 perturbation;
+ siphash_key_t perturbation;
u8 cur_depth; /* depth of longest slot */
u8 flags;
unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
@@ -157,7 +157,7 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index
static unsigned int sfq_hash(const struct sfq_sched_data *q,
const struct sk_buff *skb)
{
- return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
+ return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
}
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
@@ -607,9 +607,11 @@ static void sfq_perturbation(struct timer_list *t)
struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
struct Qdisc *sch = q->sch;
spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+ siphash_key_t nkey;
+ get_random_bytes(&nkey, sizeof(nkey));
spin_lock(root_lock);
- q->perturbation = prandom_u32();
+ q->perturbation = nkey;
if (!q->filter_list && q->tail)
sfq_rehash(sch);
spin_unlock(root_lock);
@@ -688,7 +690,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
del_timer(&q->perturb_timer);
if (q->perturb_period) {
mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
- q->perturbation = prandom_u32();
+ get_random_bytes(&q->perturbation, sizeof(q->perturbation));
}
sch_tree_unlock(sch);
kfree(p);
@@ -745,7 +747,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
q->quantum = psched_mtu(qdisc_dev(sch));
q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
q->perturb_period = 0;
- q->perturbation = prandom_u32();
+ get_random_bytes(&q->perturbation, sizeof(q->perturbation));
if (opt) {
int err = sfq_change(sch, opt);
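
Both sfb and sfq replace the 32-bit prandom_u32() perturbation with a full 128-bit siphash key filled by get_random_bytes(), so the flow-to-bucket mapping can no longer be predicted (or deliberately collided) by an off-path attacker, and sfq re-keys it on every perturbation timer tick. The same re-keying idea, modelled in userspace with getrandom(2) purely for illustration (the key layout mirrors siphash_key_t, but this is not the kernel's siphash code):

#include <stdio.h>
#include <stddef.h>
#include <sys/types.h>
#include <sys/random.h>

/* 128-bit perturbation key, regenerated periodically so that hash
 * bucket placement cannot be forced into collisions from outside.
 */
struct hash_key {
	unsigned char bytes[16];
};

static int rekey(struct hash_key *key)
{
	return getrandom(key->bytes, sizeof(key->bytes), 0) ==
	       (ssize_t)sizeof(key->bytes) ? 0 : -1;
}

int main(void)
{
	struct hash_key key;

	if (rekey(&key))
		return 1;
	for (size_t i = 0; i < sizeof(key.bytes); i++)
		printf("%02x", key.bytes[i]);
	printf("\n");
	return 0;
}
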
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index e25d414ae12f..660fc45ee40f 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -29,8 +29,9 @@ static DEFINE_SPINLOCK(taprio_list_lock);
#define TAPRIO_ALL_GATES_OPEN -1
-#define FLAGS_VALID(flags) (!((flags) & ~TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST))
#define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)
+#define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)
+#define TAPRIO_FLAGS_INVALID U32_MAX
struct sched_entry {
struct list_head list;
@@ -75,9 +76,16 @@ struct taprio_sched {
struct sched_gate_list __rcu *admin_sched;
struct hrtimer advance_timer;
struct list_head taprio_list;
+ struct sk_buff *(*dequeue)(struct Qdisc *sch);
+ struct sk_buff *(*peek)(struct Qdisc *sch);
u32 txtime_delay;
};
+struct __tc_taprio_qopt_offload {
+ refcount_t users;
+ struct tc_taprio_qopt_offload offload;
+};
+
static ktime_t sched_base_time(const struct sched_gate_list *sched)
{
if (!sched)
@@ -268,6 +276,19 @@ static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch)
return entry;
}
+static bool taprio_flags_valid(u32 flags)
+{
+ /* Make sure no other flag bits are set. */
+ if (flags & ~(TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST |
+ TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD))
+ return false;
+ /* txtime-assist and full offload are mutually exclusive */
+ if ((flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) &&
+ (flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD))
+ return false;
+ return true;
+}
+
/* This returns the tstamp value set by TCP in terms of the set clock. */
static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb)
{
@@ -417,7 +438,7 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
return qdisc_enqueue(skb, child, to_free);
}
-static struct sk_buff *taprio_peek(struct Qdisc *sch)
+static struct sk_buff *taprio_peek_soft(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
@@ -461,6 +482,36 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
return NULL;
}
+static struct sk_buff *taprio_peek_offload(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct sk_buff *skb;
+ int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct Qdisc *child = q->qdiscs[i];
+
+ if (unlikely(!child))
+ continue;
+
+ skb = child->ops->peek(child);
+ if (!skb)
+ continue;
+
+ return skb;
+ }
+
+ return NULL;
+}
+
+static struct sk_buff *taprio_peek(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+
+ return q->peek(sch);
+}
+
static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
{
atomic_set(&entry->budget,
@@ -468,7 +519,7 @@ static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
atomic64_read(&q->picos_per_byte)));
}
-static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
+static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
@@ -477,11 +528,6 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
u32 gate_mask;
int i;
- if (atomic64_read(&q->picos_per_byte) == -1) {
- WARN_ONCE(1, "taprio: dequeue() called with unknown picos per byte.");
- return NULL;
- }
-
rcu_read_lock();
entry = rcu_dereference(q->current_entry);
/* if there's no entry, it means that the schedule didn't
@@ -555,6 +601,40 @@ done:
return skb;
}
+static struct sk_buff *taprio_dequeue_offload(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct sk_buff *skb;
+ int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct Qdisc *child = q->qdiscs[i];
+
+ if (unlikely(!child))
+ continue;
+
+ skb = child->ops->dequeue(child);
+ if (unlikely(!skb))
+ continue;
+
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+
+ return skb;
+ }
+
+ return NULL;
+}
+
+static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+
+ return q->dequeue(sch);
+}
+
static bool should_restart_cycle(const struct sched_gate_list *oper,
const struct sched_entry *entry)
{
@@ -677,10 +757,6 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
[TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 },
};
-static const struct nla_policy entry_list_policy[TCA_TAPRIO_SCHED_MAX + 1] = {
- [TCA_TAPRIO_SCHED_ENTRY] = { .type = NLA_NESTED },
-};
-
static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
[TCA_TAPRIO_ATTR_PRIOMAP] = {
.len = sizeof(struct tc_mqprio_qopt)
@@ -691,6 +767,7 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
[TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 },
[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = { .type = NLA_S64 },
[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 },
+ [TCA_TAPRIO_ATTR_FLAGS] = { .type = NLA_U32 },
};
static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry,
@@ -847,7 +924,7 @@ static int taprio_parse_mqprio_opt(struct net_device *dev,
}
/* Verify priority mapping uses valid tcs */
- for (i = 0; i < TC_BITMASK + 1; i++) {
+ for (i = 0; i <= TC_BITMASK; i++) {
if (qopt->prio_tc_map[i] >= qopt->num_tc) {
NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping");
return -EINVAL;
@@ -941,6 +1018,9 @@ static void taprio_start_sched(struct Qdisc *sch,
struct taprio_sched *q = qdisc_priv(sch);
ktime_t expires;
+ if (FULL_OFFLOAD_IS_ENABLED(q->flags))
+ return;
+
expires = hrtimer_get_expires(&q->advance_timer);
if (expires == 0)
expires = KTIME_MAX;
@@ -958,12 +1038,19 @@ static void taprio_set_picos_per_byte(struct net_device *dev,
struct taprio_sched *q)
{
struct ethtool_link_ksettings ecmd;
- int picos_per_byte = -1;
+ int speed = SPEED_10;
+ int picos_per_byte;
+ int err;
+
+ err = __ethtool_get_link_ksettings(dev, &ecmd);
+ if (err < 0)
+ goto skip;
+
+ if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN)
+ speed = ecmd.base.speed;
- if (!__ethtool_get_link_ksettings(dev, &ecmd) &&
- ecmd.base.speed != SPEED_UNKNOWN)
- picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8,
- ecmd.base.speed * 1000 * 1000);
+skip:
+ picos_per_byte = (USEC_PER_SEC * 8) / speed;
atomic64_set(&q->picos_per_byte, picos_per_byte);
netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n",
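
With the old code a failed or unknown link-speed query left picos_per_byte at -1 and dequeue() warned and gave up; the new fallback assumes SPEED_10 and always computes picos_per_byte = (USEC_PER_SEC * 8) / speed with speed in Mbit/s, i.e. 8,000,000 / speed picoseconds per byte. A quick standalone check of the values this formula produces:

#include <stdio.h>

#define USEC_PER_SEC 1000000L

/* Fallback byte-time computation from taprio_set_picos_per_byte():
 * picos_per_byte = (USEC_PER_SEC * 8) / speed, speed in Mbit/s.
 */
int main(void)
{
	int speeds[] = { 10, 100, 1000, 2500, 10000 };

	for (int i = 0; i < 5; i++)
		printf("%5d Mbit/s -> %ld ps per byte\n",
		       speeds[i], (USEC_PER_SEC * 8) / speeds[i]);
	return 0;
}

At 1 Gbit/s this yields 8000 ps per byte, matching the 8 ns it takes to serialize one byte on the wire.
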
@@ -1012,6 +1099,303 @@ static void setup_txtime(struct taprio_sched *q,
}
}
+static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries)
+{
+ size_t size = sizeof(struct tc_taprio_sched_entry) * num_entries +
+ sizeof(struct __tc_taprio_qopt_offload);
+ struct __tc_taprio_qopt_offload *__offload;
+
+ __offload = kzalloc(size, GFP_KERNEL);
+ if (!__offload)
+ return NULL;
+
+ refcount_set(&__offload->users, 1);
+
+ return &__offload->offload;
+}
+
+struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload
+ *offload)
+{
+ struct __tc_taprio_qopt_offload *__offload;
+
+ __offload = container_of(offload, struct __tc_taprio_qopt_offload,
+ offload);
+
+ refcount_inc(&__offload->users);
+
+ return offload;
+}
+EXPORT_SYMBOL_GPL(taprio_offload_get);
+
+void taprio_offload_free(struct tc_taprio_qopt_offload *offload)
+{
+ struct __tc_taprio_qopt_offload *__offload;
+
+ __offload = container_of(offload, struct __tc_taprio_qopt_offload,
+ offload);
+
+ if (!refcount_dec_and_test(&__offload->users))
+ return;
+
+ kfree(__offload);
+}
+EXPORT_SYMBOL_GPL(taprio_offload_free);
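
taprio_offload_get()/taprio_offload_free() make the offload descriptor refcounted, so a driver handling TC_SETUP_QDISC_TAPRIO can legally hold on to it after ndo_setup_tc() returns, for instance to program the hardware from a worker once the base time arrives. A hypothetical driver-side pattern; foo_priv, priv->taprio, foo_apply_schedule() and foo_remove_schedule() are placeholders, not existing kernel code:

/* Hedged sketch: keep a reference to the schedule beyond ndo_setup_tc(). */
static int foo_setup_taprio(struct foo_priv *priv,
			    struct tc_taprio_qopt_offload *offload)
{
	/* Drop any previously held schedule first. */
	if (priv->taprio) {
		taprio_offload_free(priv->taprio);
		priv->taprio = NULL;
	}

	if (!offload->enable)
		return foo_remove_schedule(priv);

	/* Take our own reference so the entries stay valid after
	 * ndo_setup_tc() returns; taprio frees its copy right away.
	 */
	priv->taprio = taprio_offload_get(offload);
	return foo_apply_schedule(priv, priv->taprio);
}
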
+
+/* This function only serves to keep the pointers to the "oper" and "admin"
+ * schedules valid in relation to their base times, so that when calling
+ * dump() the user looks at the right schedules.
+ * When using full offload, the admin configuration is promoted to oper at the
+ * base_time in the PHC time domain. But because the system time is not
+ * necessarily in sync with that, we can't just trigger a hrtimer to call
+ * switch_schedules at the right hardware time.
+ * At the moment we call this by hand right away from taprio, but in the future
+ * it will be useful to create a mechanism for drivers to notify taprio of the
+ * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump().
+ * This is left as TODO.
+ */
+static void taprio_offload_config_changed(struct taprio_sched *q)
+{
+ struct sched_gate_list *oper, *admin;
+
+ spin_lock(&q->current_entry_lock);
+
+ oper = rcu_dereference_protected(q->oper_sched,
+ lockdep_is_held(&q->current_entry_lock));
+ admin = rcu_dereference_protected(q->admin_sched,
+ lockdep_is_held(&q->current_entry_lock));
+
+ switch_schedules(q, &admin, &oper);
+
+ spin_unlock(&q->current_entry_lock);
+}
+
+static void taprio_sched_to_offload(struct taprio_sched *q,
+ struct sched_gate_list *sched,
+ const struct tc_mqprio_qopt *mqprio,
+ struct tc_taprio_qopt_offload *offload)
+{
+ struct sched_entry *entry;
+ int i = 0;
+
+ offload->base_time = sched->base_time;
+ offload->cycle_time = sched->cycle_time;
+ offload->cycle_time_extension = sched->cycle_time_extension;
+
+ list_for_each_entry(entry, &sched->entries, list) {
+ struct tc_taprio_sched_entry *e = &offload->entries[i];
+
+ e->command = entry->command;
+ e->interval = entry->interval;
+ e->gate_mask = entry->gate_mask;
+ i++;
+ }
+
+ offload->num_entries = i;
+}
+
+static int taprio_enable_offload(struct net_device *dev,
+ struct tc_mqprio_qopt *mqprio,
+ struct taprio_sched *q,
+ struct sched_gate_list *sched,
+ struct netlink_ext_ack *extack)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct tc_taprio_qopt_offload *offload;
+ int err = 0;
+
+ if (!ops->ndo_setup_tc) {
+ NL_SET_ERR_MSG(extack,
+ "Device does not support taprio offload");
+ return -EOPNOTSUPP;
+ }
+
+ offload = taprio_offload_alloc(sched->num_entries);
+ if (!offload) {
+ NL_SET_ERR_MSG(extack,
+ "Not enough memory for enabling offload mode");
+ return -ENOMEM;
+ }
+ offload->enable = 1;
+ taprio_sched_to_offload(q, sched, mqprio, offload);
+
+ err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Device failed to setup taprio offload");
+ goto done;
+ }
+
+done:
+ taprio_offload_free(offload);
+
+ return err;
+}
+
+static int taprio_disable_offload(struct net_device *dev,
+ struct taprio_sched *q,
+ struct netlink_ext_ack *extack)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct tc_taprio_qopt_offload *offload;
+ int err;
+
+ if (!FULL_OFFLOAD_IS_ENABLED(q->flags))
+ return 0;
+
+ if (!ops->ndo_setup_tc)
+ return -EOPNOTSUPP;
+
+ offload = taprio_offload_alloc(0);
+ if (!offload) {
+ NL_SET_ERR_MSG(extack,
+ "Not enough memory to disable offload mode");
+ return -ENOMEM;
+ }
+ offload->enable = 0;
+
+ err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Device failed to disable offload");
+ goto out;
+ }
+
+out:
+ taprio_offload_free(offload);
+
+ return err;
+}
+
+/* If full offload is enabled, the only possible clockid is the net device's
+ * PHC. For that reason, specifying a clockid through netlink is incorrect.
+ * For txtime-assist, it is implicitly assumed that the device's PHC is kept
+ * in sync with the specified clockid via a user space daemon such as phc2sys.
+ * For both software taprio and txtime-assist, the clockid is used for the
+ * hrtimer that advances the schedule and is hence mandatory.
+ */
+static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ int err = -EINVAL;
+
+ if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_ts_info info = {
+ .cmd = ETHTOOL_GET_TS_INFO,
+ .phc_index = -1,
+ };
+
+ if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
+ NL_SET_ERR_MSG(extack,
+ "The 'clockid' cannot be specified for full offload");
+ goto out;
+ }
+
+ if (ops && ops->get_ts_info)
+ err = ops->get_ts_info(dev, &info);
+
+ if (err || info.phc_index < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Device does not have a PTP clock");
+ err = -ENOTSUPP;
+ goto out;
+ }
+ } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
+ int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);
+
+ /* We only support static clockids and we don't allow
+ * for it to be modified after the first init.
+ */
+ if (clockid < 0 ||
+ (q->clockid != -1 && q->clockid != clockid)) {
+ NL_SET_ERR_MSG(extack,
+ "Changing the 'clockid' of a running schedule is not supported");
+ err = -ENOTSUPP;
+ goto out;
+ }
+
+ switch (clockid) {
+ case CLOCK_REALTIME:
+ q->tk_offset = TK_OFFS_REAL;
+ break;
+ case CLOCK_MONOTONIC:
+ q->tk_offset = TK_OFFS_MAX;
+ break;
+ case CLOCK_BOOTTIME:
+ q->tk_offset = TK_OFFS_BOOT;
+ break;
+ case CLOCK_TAI:
+ q->tk_offset = TK_OFFS_TAI;
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
+ err = -EINVAL;
+ goto out;
+ }
+
+ q->clockid = clockid;
+ } else {
+ NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory");
+ goto out;
+ }
+
+ /* Everything went ok, return success. */
+ err = 0;
+
+out:
+ return err;
+}
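
A device therefore qualifies for full offload only if get_ts_info() reports a PTP hardware clock (phc_index >= 0); otherwise the change fails with -ENOTSUPP. The same precondition can be checked from userspace before configuring the qdisc, using the standard SIOCETHTOOL / ETHTOOL_GET_TS_INFO ioctl that ethtool -T is built on; a small self-contained example:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

/* Query the PHC index of an interface, mirroring the in-kernel check
 * taprio's full-offload mode performs via ops->get_ts_info().
 */
int main(int argc, char **argv)
{
	struct ethtool_ts_info info = { .cmd = ETHTOOL_GET_TS_INFO };
	struct ifreq ifr = { 0 };
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <ifname>\n", argv[0]);
		return 1;
	}

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	strncpy(ifr.ifr_name, argv[1], IFNAMSIZ - 1);
	ifr.ifr_data = (char *)&info;

	if (ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
		perror("SIOCETHTOOL");
		close(fd);
		return 1;
	}
	close(fd);

	if (info.phc_index < 0) {
		printf("%s has no PTP hardware clock; full-offload taprio "
		       "would fail\n", argv[1]);
		return 1;
	}
	printf("%s: PHC is /dev/ptp%d\n", argv[1], info.phc_index);
	return 0;
}
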
+
+static int taprio_mqprio_cmp(const struct net_device *dev,
+ const struct tc_mqprio_qopt *mqprio)
+{
+ int i;
+
+ if (!mqprio || mqprio->num_tc != dev->num_tc)
+ return -1;
+
+ for (i = 0; i < mqprio->num_tc; i++)
+ if (dev->tc_to_txq[i].count != mqprio->count[i] ||
+ dev->tc_to_txq[i].offset != mqprio->offset[i])
+ return -1;
+
+ for (i = 0; i <= TC_BITMASK; i++)
+ if (dev->prio_tc_map[i] != mqprio->prio_tc_map[i])
+ return -1;
+
+ return 0;
+}
+
+/* The semantics of the 'flags' argument in relation to 'change()'
+ * requests, are interpreted following two rules (which are applied in
+ * this order): (1) an omitted 'flags' argument is interpreted as
+ * zero; (2) the 'flags' of a "running" taprio instance cannot be
+ * changed.
+ */
+static int taprio_new_flags(const struct nlattr *attr, u32 old,
+ struct netlink_ext_ack *extack)
+{
+ u32 new = 0;
+
+ if (attr)
+ new = nla_get_u32(attr);
+
+ if (old != TAPRIO_FLAGS_INVALID && old != new) {
+ NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (!taprio_flags_valid(new)) {
+ NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid");
+ return -EINVAL;
+ }
+
+ return new;
+}
+
static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -1020,10 +1404,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
struct tc_mqprio_qopt *mqprio = NULL;
- u32 taprio_flags = 0;
- int i, err, clockid;
unsigned long flags;
ktime_t start;
+ int i, err;
err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt,
taprio_policy, extack);
@@ -1033,21 +1416,14 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);
- if (tb[TCA_TAPRIO_ATTR_FLAGS]) {
- taprio_flags = nla_get_u32(tb[TCA_TAPRIO_ATTR_FLAGS]);
-
- if (q->flags != 0 && q->flags != taprio_flags) {
- NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported");
- return -EOPNOTSUPP;
- } else if (!FLAGS_VALID(taprio_flags)) {
- NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid");
- return -EINVAL;
- }
+ err = taprio_new_flags(tb[TCA_TAPRIO_ATTR_FLAGS],
+ q->flags, extack);
+ if (err < 0)
+ return err;
- q->flags = taprio_flags;
- }
+ q->flags = err;
- err = taprio_parse_mqprio_opt(dev, mqprio, extack, taprio_flags);
+ err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags);
if (err < 0)
return err;
@@ -1063,6 +1439,10 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
admin = rcu_dereference(q->admin_sched);
rcu_read_unlock();
+ /* no changes - no new mqprio settings */
+ if (!taprio_mqprio_cmp(dev, mqprio))
+ mqprio = NULL;
+
if (mqprio && (oper || admin)) {
NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported");
err = -ENOTSUPP;
@@ -1079,29 +1459,31 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
goto free_sched;
}
- if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
- clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);
+ err = taprio_parse_clockid(sch, tb, extack);
+ if (err < 0)
+ goto free_sched;
- /* We only support static clockids and we don't allow
- * for it to be modified after the first init.
- */
- if (clockid < 0 ||
- (q->clockid != -1 && q->clockid != clockid)) {
- NL_SET_ERR_MSG(extack, "Changing the 'clockid' of a running schedule is not supported");
- err = -ENOTSUPP;
- goto free_sched;
- }
+ taprio_set_picos_per_byte(dev, q);
- q->clockid = clockid;
- }
+ if (mqprio) {
+ netdev_set_num_tc(dev, mqprio->num_tc);
+ for (i = 0; i < mqprio->num_tc; i++)
+ netdev_set_tc_queue(dev, i,
+ mqprio->count[i],
+ mqprio->offset[i]);
- if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
- NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory");
- err = -EINVAL;
- goto free_sched;
+ /* Always use supplied priority mappings */
+ for (i = 0; i <= TC_BITMASK; i++)
+ netdev_set_prio_tc_map(dev, i,
+ mqprio->prio_tc_map[i]);
}
- taprio_set_picos_per_byte(dev, q);
+ if (FULL_OFFLOAD_IS_ENABLED(q->flags))
+ err = taprio_enable_offload(dev, mqprio, q, new_admin, extack);
+ else
+ err = taprio_disable_offload(dev, q, extack);
+ if (err)
+ goto free_sched;
/* Protects against enqueue()/dequeue() */
spin_lock_bh(qdisc_lock(sch));
@@ -1116,42 +1498,22 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]);
}
- if (!TXTIME_ASSIST_IS_ENABLED(taprio_flags) &&
+ if (!TXTIME_ASSIST_IS_ENABLED(q->flags) &&
+ !FULL_OFFLOAD_IS_ENABLED(q->flags) &&
!hrtimer_active(&q->advance_timer)) {
hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
q->advance_timer.function = advance_sched;
}
- if (mqprio) {
- netdev_set_num_tc(dev, mqprio->num_tc);
- for (i = 0; i < mqprio->num_tc; i++)
- netdev_set_tc_queue(dev, i,
- mqprio->count[i],
- mqprio->offset[i]);
-
- /* Always use supplied priority mappings */
- for (i = 0; i < TC_BITMASK + 1; i++)
- netdev_set_prio_tc_map(dev, i,
- mqprio->prio_tc_map[i]);
- }
-
- switch (q->clockid) {
- case CLOCK_REALTIME:
- q->tk_offset = TK_OFFS_REAL;
- break;
- case CLOCK_MONOTONIC:
- q->tk_offset = TK_OFFS_MAX;
- break;
- case CLOCK_BOOTTIME:
- q->tk_offset = TK_OFFS_BOOT;
- break;
- case CLOCK_TAI:
- q->tk_offset = TK_OFFS_TAI;
- break;
- default:
- NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
- err = -EINVAL;
- goto unlock;
+ if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
+ q->dequeue = taprio_dequeue_offload;
+ q->peek = taprio_peek_offload;
+ } else {
+ /* Be sure to always keep the function pointers
+ * in a consistent state.
+ */
+ q->dequeue = taprio_dequeue_soft;
+ q->peek = taprio_peek_soft;
}
err = taprio_get_start_time(sch, new_admin, &start);
@@ -1160,9 +1522,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
goto unlock;
}
- if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) {
- setup_txtime(q, new_admin, start);
+ setup_txtime(q, new_admin, start);
+ if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
if (!oper) {
rcu_assign_pointer(q->oper_sched, new_admin);
err = 0;
@@ -1186,6 +1548,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
call_rcu(&admin->rcu, taprio_free_sched_cb);
spin_unlock_irqrestore(&q->current_entry_lock, flags);
+
+ if (FULL_OFFLOAD_IS_ENABLED(q->flags))
+ taprio_offload_config_changed(q);
}
new_admin = NULL;
@@ -1213,6 +1578,8 @@ static void taprio_destroy(struct Qdisc *sch)
hrtimer_cancel(&q->advance_timer);
+ taprio_disable_offload(dev, q, NULL);
+
if (q->qdiscs) {
for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++)
qdisc_put(q->qdiscs[i]);
@@ -1221,7 +1588,7 @@ static void taprio_destroy(struct Qdisc *sch)
}
q->qdiscs = NULL;
- netdev_set_num_tc(dev, 0);
+ netdev_reset_tc(dev);
if (q->oper_sched)
call_rcu(&q->oper_sched->rcu, taprio_free_sched_cb);
@@ -1242,12 +1609,20 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS);
q->advance_timer.function = advance_sched;
+ q->dequeue = taprio_dequeue_soft;
+ q->peek = taprio_peek_soft;
+
q->root = sch;
/* We only support static clockids. Use an invalid value as default
* and get the valid one on taprio_change().
*/
q->clockid = -1;
+ q->flags = TAPRIO_FLAGS_INVALID;
+
+ spin_lock(&taprio_list_lock);
+ list_add(&q->taprio_list, &taprio_list);
+ spin_unlock(&taprio_list_lock);
if (sch->parent != TC_H_ROOT)
return -EOPNOTSUPP;
@@ -1266,10 +1641,6 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
if (!opt)
return -EINVAL;
- spin_lock(&taprio_list_lock);
- list_add(&q->taprio_list, &taprio_list);
- spin_unlock(&taprio_list_lock);
-
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *dev_queue;
struct Qdisc *qdisc;
@@ -1424,7 +1795,8 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt))
goto options_error;
- if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
+ if (!FULL_OFFLOAD_IS_ENABLED(q->flags) &&
+ nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
goto options_error;
if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags))
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 5f72f3f916a5..78e79029dc63 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -15,6 +15,7 @@
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
+#include <net/pkt_cls.h>
#include <net/pkt_sched.h>
@@ -137,6 +138,52 @@ static u64 psched_ns_t2l(const struct psched_ratecfg *r,
return len;
}
+static void tbf_offload_change(struct Qdisc *sch)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_tbf_qopt_offload qopt;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_TBF_REPLACE;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.replace_params.rate = q->rate;
+ qopt.replace_params.max_size = q->max_size;
+ qopt.replace_params.qstats = &sch->qstats;
+
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
+}
+
+static void tbf_offload_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_tbf_qopt_offload qopt;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_TBF_DESTROY;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
+}
+
+static int tbf_offload_dump(struct Qdisc *sch)
+{
+ struct tc_tbf_qopt_offload qopt;
+
+ qopt.command = TC_TBF_STATS;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.stats.bstats = &sch->bstats;
+ qopt.stats.qstats = &sch->qstats;
+
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt);
+}
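
On the driver side, these helpers surface as TC_TBF_REPLACE, TC_TBF_DESTROY and TC_TBF_STATS commands delivered through ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, ...). A hypothetical handler, using only the fields the qdisc fills in above (the foo_* names and hardware calls are placeholders, not a real driver):

/* Hedged sketch of a NIC driver reacting to the TBF offload requests. */
static int foo_setup_tc_tbf(struct net_device *dev,
			    struct tc_tbf_qopt_offload *qopt)
{
	switch (qopt->command) {
	case TC_TBF_REPLACE:
		/* rate is a struct psched_ratecfg; max_size is the
		 * largest packet the token bucket admits.
		 */
		return foo_hw_set_shaper(dev, qopt->parent,
					 qopt->replace_params.rate.rate_bytes_ps,
					 qopt->replace_params.max_size);
	case TC_TBF_DESTROY:
		return foo_hw_del_shaper(dev, qopt->parent);
	case TC_TBF_STATS:
		/* Fill qopt->stats.bstats / qopt->stats.qstats here. */
		return -EOPNOTSUPP;
	default:
		return -EOPNOTSUPP;
	}
}
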
+
/* GSO packet is too big, segment it so that tbf can transmit
* each segment in time
*/
@@ -155,8 +202,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
return qdisc_drop(skb, sch, to_free);
nb = 0;
- while (segs) {
- nskb = segs->next;
+ skb_list_walk_safe(segs, segs, nskb) {
skb_mark_not_on_list(segs);
qdisc_skb_cb(segs)->pkt_len = segs->len;
len += segs->len;
@@ -167,7 +213,6 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
} else {
nb++;
}
- segs = nskb;
}
sch->q.qlen += nb;
if (nb > 1)
@@ -409,6 +454,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
sch_tree_unlock(sch);
err = 0;
+
+ tbf_offload_change(sch);
done:
return err;
}
@@ -434,6 +481,7 @@ static void tbf_destroy(struct Qdisc *sch)
struct tbf_sched_data *q = qdisc_priv(sch);
qdisc_watchdog_cancel(&q->watchdog);
+ tbf_offload_destroy(sch);
qdisc_put(q->qdisc);
}
@@ -442,8 +490,12 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
struct tbf_sched_data *q = qdisc_priv(sch);
struct nlattr *nest;
struct tc_tbf_qopt opt;
+ int err;
+
+ err = tbf_offload_dump(sch);
+ if (err)
+ return err;
- sch->qstats.backlog = q->qdisc->qstats.backlog;
nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;