diff options
Diffstat (limited to 'net/sched')
-rw-r--r-- | net/sched/Kconfig | 14 | ||||
-rw-r--r-- | net/sched/Makefile | 1 | ||||
-rw-r--r-- | net/sched/cls_cgroup.c | 39 | ||||
-rw-r--r-- | net/sched/sch_api.c | 94 | ||||
-rw-r--r-- | net/sched/sch_choke.c | 3 | ||||
-rw-r--r-- | net/sched/sch_fq.c | 793 | ||||
-rw-r--r-- | net/sched/sch_generic.c | 28 | ||||
-rw-r--r-- | net/sched/sch_htb.c | 13 | ||||
-rw-r--r-- | net/sched/sch_mq.c | 2 | ||||
-rw-r--r-- | net/sched/sch_mqprio.c | 2 | ||||
-rw-r--r-- | net/sched/sch_netem.c | 5 |
11 files changed, 961 insertions, 33 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 235e01acac51..c03a32a0418e 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -272,6 +272,20 @@ config NET_SCH_FQ_CODEL If unsure, say N. +config NET_SCH_FQ + tristate "Fair Queue" + help + Say Y here if you want to use the FQ packet scheduling algorithm. + + FQ does flow separation, and is able to respect pacing requirements + set by TCP stack into sk->sk_pacing_rate (for localy generated + traffic) + + To compile this driver as a module, choose M here: the module + will be called sch_fq. + + If unsure, say N. + config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT diff --git a/net/sched/Makefile b/net/sched/Makefile index 978cbf004e80..e5f9abe9a5db 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o +obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 3a294eb98d61..867b4a3e3980 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -23,19 +23,18 @@ #include <net/sock.h> #include <net/cls_cgroup.h> -static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) +static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css) { - return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id), - struct cgroup_cls_state, css); + return css ? container_of(css, struct cgroup_cls_state, css) : NULL; } static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) { - return container_of(task_subsys_state(p, net_cls_subsys_id), - struct cgroup_cls_state, css); + return css_cls_state(task_css(p, net_cls_subsys_id)); } -static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_cls_state *cs; @@ -45,17 +44,19 @@ static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) return &cs->css; } -static int cgrp_css_online(struct cgroup *cgrp) +static int cgrp_css_online(struct cgroup_subsys_state *css) { - if (cgrp->parent) - cgrp_cls_state(cgrp)->classid = - cgrp_cls_state(cgrp->parent)->classid; + struct cgroup_cls_state *cs = css_cls_state(css); + struct cgroup_cls_state *parent = css_cls_state(css_parent(css)); + + if (parent) + cs->classid = parent->classid; return 0; } -static void cgrp_css_free(struct cgroup *cgrp) +static void cgrp_css_free(struct cgroup_subsys_state *css) { - kfree(cgrp_cls_state(cgrp)); + kfree(css_cls_state(css)); } static int update_classid(const void *v, struct file *file, unsigned n) @@ -67,12 +68,13 @@ static int update_classid(const void *v, struct file *file, unsigned n) return 0; } -static void cgrp_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void cgrp_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *p; void *v; - cgroup_taskset_for_each(p, cgrp, tset) { + cgroup_taskset_for_each(p, css, tset) { task_lock(p); v = (void *)(unsigned long)task_cls_classid(p); iterate_fd(p->files, 0, update_classid, v); @@ -80,14 +82,15 @@ static void cgrp_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) } } -static u64 read_classid(struct cgroup *cgrp, struct cftype *cft) +static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) { - return cgrp_cls_state(cgrp)->classid; + return css_cls_state(css)->classid; } -static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value) +static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, + u64 value) { - cgrp_cls_state(cgrp)->classid = (u32) value; + css_cls_state(css)->classid = (u32) value; return 0; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 281c1bded1f6..2adda7fa2d39 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -200,6 +200,58 @@ int unregister_qdisc(struct Qdisc_ops *qops) } EXPORT_SYMBOL(unregister_qdisc); +/* Get default qdisc if not otherwise specified */ +void qdisc_get_default(char *name, size_t len) +{ + read_lock(&qdisc_mod_lock); + strlcpy(name, default_qdisc_ops->id, len); + read_unlock(&qdisc_mod_lock); +} + +static struct Qdisc_ops *qdisc_lookup_default(const char *name) +{ + struct Qdisc_ops *q = NULL; + + for (q = qdisc_base; q; q = q->next) { + if (!strcmp(name, q->id)) { + if (!try_module_get(q->owner)) + q = NULL; + break; + } + } + + return q; +} + +/* Set new default qdisc to use */ +int qdisc_set_default(const char *name) +{ + const struct Qdisc_ops *ops; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + write_lock(&qdisc_mod_lock); + ops = qdisc_lookup_default(name); + if (!ops) { + /* Not found, drop lock and try to load module */ + write_unlock(&qdisc_mod_lock); + request_module("sch_%s", name); + write_lock(&qdisc_mod_lock); + + ops = qdisc_lookup_default(name); + } + + if (ops) { + /* Set new default */ + module_put(default_qdisc_ops->owner); + default_qdisc_ops = ops; + } + write_unlock(&qdisc_mod_lock); + + return ops ? 0 : -ENOENT; +} + /* We know handle. Find qdisc among all qdisc's attached to device (root qdisc, all its children, children of children etc.) */ @@ -285,6 +337,45 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) return q; } +/* The linklayer setting were not transferred from iproute2, in older + * versions, and the rate tables lookup systems have been dropped in + * the kernel. To keep backward compatible with older iproute2 tc + * utils, we detect the linklayer setting by detecting if the rate + * table were modified. + * + * For linklayer ATM table entries, the rate table will be aligned to + * 48 bytes, thus some table entries will contain the same value. The + * mpu (min packet unit) is also encoded into the old rate table, thus + * starting from the mpu, we find low and high table entries for + * mapping this cell. If these entries contain the same value, when + * the rate tables have been modified for linklayer ATM. + * + * This is done by rounding mpu to the nearest 48 bytes cell/entry, + * and then roundup to the next cell, calc the table entry one below, + * and compare. + */ +static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) +{ + int low = roundup(r->mpu, 48); + int high = roundup(low+1, 48); + int cell_low = low >> r->cell_log; + int cell_high = (high >> r->cell_log) - 1; + + /* rtab is too inaccurate at rates > 100Mbit/s */ + if ((r->rate > (100000000/8)) || (rtab[0] == 0)) { + pr_debug("TC linklayer: Giving up ATM detection\n"); + return TC_LINKLAYER_ETHERNET; + } + + if ((cell_high > cell_low) && (cell_high < 256) + && (rtab[cell_low] == rtab[cell_high])) { + pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n", + cell_low, cell_high, rtab[cell_high]); + return TC_LINKLAYER_ATM; + } + return TC_LINKLAYER_ETHERNET; +} + static struct qdisc_rate_table *qdisc_rtab_list; struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab) @@ -308,6 +399,8 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *ta rtab->rate = *r; rtab->refcnt = 1; memcpy(rtab->data, nla_data(tab), 1024); + if (r->linklayer == TC_LINKLAYER_UNAWARE) + r->linklayer = __detect_linklayer(r, rtab->data); rtab->next = qdisc_rtab_list; qdisc_rtab_list = rtab; } @@ -1813,6 +1906,7 @@ static int __init pktsched_init(void) return err; } + register_qdisc(&pfifo_fast_ops); register_qdisc(&pfifo_qdisc_ops); register_qdisc(&bfifo_qdisc_ops); register_qdisc(&pfifo_head_drop_qdisc_ops); diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index ef53ab8d0aae..ddd73cb2d7ba 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -438,7 +438,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) if (mask != q->tab_mask) { struct sk_buff **ntab; - ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL); + ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), + GFP_KERNEL | __GFP_NOWARN); if (!ntab) ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *)); if (!ntab) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c new file mode 100644 index 000000000000..32ad015ee8ce --- /dev/null +++ b/net/sched/sch_fq.c @@ -0,0 +1,793 @@ +/* + * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing) + * + * Copyright (C) 2013 Eric Dumazet <edumazet@google.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Meant to be mostly used for localy generated traffic : + * Fast classification depends on skb->sk being set before reaching us. + * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash. + * All packets belonging to a socket are considered as a 'flow'. + * + * Flows are dynamically allocated and stored in a hash table of RB trees + * They are also part of one Round Robin 'queues' (new or old flows) + * + * Burst avoidance (aka pacing) capability : + * + * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a + * bunch of packets, and this packet scheduler adds delay between + * packets to respect rate limitation. + * + * enqueue() : + * - lookup one RB tree (out of 1024 or more) to find the flow. + * If non existent flow, create it, add it to the tree. + * Add skb to the per flow list of skb (fifo). + * - Use a special fifo for high prio packets + * + * dequeue() : serves flows in Round Robin + * Note : When a flow becomes empty, we do not immediately remove it from + * rb trees, for performance reasons (its expected to send additional packets, + * or SLAB cache will reuse socket for another flow) + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/hash.h> +#include <linux/prefetch.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/sock.h> +#include <net/tcp_states.h> + +/* + * Per flow structure, dynamically allocated + */ +struct fq_flow { + struct sk_buff *head; /* list of skbs for this flow : first skb */ + union { + struct sk_buff *tail; /* last skb in the list */ + unsigned long age; /* jiffies when flow was emptied, for gc */ + }; + struct rb_node fq_node; /* anchor in fq_root[] trees */ + struct sock *sk; + int qlen; /* number of packets in flow queue */ + int credit; + u32 socket_hash; /* sk_hash */ + struct fq_flow *next; /* next pointer in RR lists, or &detached */ + + struct rb_node rate_node; /* anchor in q->delayed tree */ + u64 time_next_packet; +}; + +struct fq_flow_head { + struct fq_flow *first; + struct fq_flow *last; +}; + +struct fq_sched_data { + struct fq_flow_head new_flows; + + struct fq_flow_head old_flows; + + struct rb_root delayed; /* for rate limited flows */ + u64 time_next_delayed_flow; + + struct fq_flow internal; /* for non classified or high prio packets */ + u32 quantum; + u32 initial_quantum; + u32 flow_default_rate;/* rate per flow : bytes per second */ + u32 flow_max_rate; /* optional max rate per flow */ + u32 flow_plimit; /* max packets per flow */ + struct rb_root *fq_root; + u8 rate_enable; + u8 fq_trees_log; + + u32 flows; + u32 inactive_flows; + u32 throttled_flows; + + u64 stat_gc_flows; + u64 stat_internal_packets; + u64 stat_tcp_retrans; + u64 stat_throttled; + u64 stat_flows_plimit; + u64 stat_pkts_too_long; + u64 stat_allocation_errors; + struct qdisc_watchdog watchdog; +}; + +/* special value to mark a detached flow (not on old/new list) */ +static struct fq_flow detached, throttled; + +static void fq_flow_set_detached(struct fq_flow *f) +{ + f->next = &detached; +} + +static bool fq_flow_is_detached(const struct fq_flow *f) +{ + return f->next == &detached; +} + +static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) +{ + struct rb_node **p = &q->delayed.rb_node, *parent = NULL; + + while (*p) { + struct fq_flow *aux; + + parent = *p; + aux = container_of(parent, struct fq_flow, rate_node); + if (f->time_next_packet >= aux->time_next_packet) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&f->rate_node, parent, p); + rb_insert_color(&f->rate_node, &q->delayed); + q->throttled_flows++; + q->stat_throttled++; + + f->next = &throttled; + if (q->time_next_delayed_flow > f->time_next_packet) + q->time_next_delayed_flow = f->time_next_packet; +} + + +static struct kmem_cache *fq_flow_cachep __read_mostly; + +static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow) +{ + if (head->first) + head->last->next = flow; + else + head->first = flow; + head->last = flow; + flow->next = NULL; +} + +/* limit number of collected flows per round */ +#define FQ_GC_MAX 8 +#define FQ_GC_AGE (3*HZ) + +static bool fq_gc_candidate(const struct fq_flow *f) +{ + return fq_flow_is_detached(f) && + time_after(jiffies, f->age + FQ_GC_AGE); +} + +static void fq_gc(struct fq_sched_data *q, + struct rb_root *root, + struct sock *sk) +{ + struct fq_flow *f, *tofree[FQ_GC_MAX]; + struct rb_node **p, *parent; + int fcnt = 0; + + p = &root->rb_node; + parent = NULL; + while (*p) { + parent = *p; + + f = container_of(parent, struct fq_flow, fq_node); + if (f->sk == sk) + break; + + if (fq_gc_candidate(f)) { + tofree[fcnt++] = f; + if (fcnt == FQ_GC_MAX) + break; + } + + if (f->sk > sk) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + + q->flows -= fcnt; + q->inactive_flows -= fcnt; + q->stat_gc_flows += fcnt; + while (fcnt) { + struct fq_flow *f = tofree[--fcnt]; + + rb_erase(&f->fq_node, root); + kmem_cache_free(fq_flow_cachep, f); + } +} + +static const u8 prio2band[TC_PRIO_MAX + 1] = { + 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 +}; + +static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) +{ + struct rb_node **p, *parent; + struct sock *sk = skb->sk; + struct rb_root *root; + struct fq_flow *f; + int band; + + /* warning: no starvation prevention... */ + band = prio2band[skb->priority & TC_PRIO_MAX]; + if (unlikely(band == 0)) + return &q->internal; + + if (unlikely(!sk)) { + /* By forcing low order bit to 1, we make sure to not + * collide with a local flow (socket pointers are word aligned) + */ + sk = (struct sock *)(skb_get_rxhash(skb) | 1L); + } + + root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)]; + + if (q->flows >= (2U << q->fq_trees_log) && + q->inactive_flows > q->flows/2) + fq_gc(q, root, sk); + + p = &root->rb_node; + parent = NULL; + while (*p) { + parent = *p; + + f = container_of(parent, struct fq_flow, fq_node); + if (f->sk == sk) { + /* socket might have been reallocated, so check + * if its sk_hash is the same. + * It not, we need to refill credit with + * initial quantum + */ + if (unlikely(skb->sk && + f->socket_hash != sk->sk_hash)) { + f->credit = q->initial_quantum; + f->socket_hash = sk->sk_hash; + } + return f; + } + if (f->sk > sk) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + + f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!f)) { + q->stat_allocation_errors++; + return &q->internal; + } + fq_flow_set_detached(f); + f->sk = sk; + if (skb->sk) + f->socket_hash = sk->sk_hash; + f->credit = q->initial_quantum; + + rb_link_node(&f->fq_node, parent, p); + rb_insert_color(&f->fq_node, root); + + q->flows++; + q->inactive_flows++; + return f; +} + + +/* remove one skb from head of flow queue */ +static struct sk_buff *fq_dequeue_head(struct fq_flow *flow) +{ + struct sk_buff *skb = flow->head; + + if (skb) { + flow->head = skb->next; + skb->next = NULL; + flow->qlen--; + } + return skb; +} + +/* We might add in the future detection of retransmits + * For the time being, just return false + */ +static bool skb_is_retransmit(struct sk_buff *skb) +{ + return false; +} + +/* add skb to flow queue + * flow queue is a linked list, kind of FIFO, except for TCP retransmits + * We special case tcp retransmits to be transmitted before other packets. + * We rely on fact that TCP retransmits are unlikely, so we do not waste + * a separate queue or a pointer. + * head-> [retrans pkt 1] + * [retrans pkt 2] + * [ normal pkt 1] + * [ normal pkt 2] + * [ normal pkt 3] + * tail-> [ normal pkt 4] + */ +static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) +{ + struct sk_buff *prev, *head = flow->head; + + skb->next = NULL; + if (!head) { + flow->head = skb; + flow->tail = skb; + return; + } + if (likely(!skb_is_retransmit(skb))) { + flow->tail->next = skb; + flow->tail = skb; + return; + } + + /* This skb is a tcp retransmit, + * find the last retrans packet in the queue + */ + prev = NULL; + while (skb_is_retransmit(head)) { + prev = head; + head = head->next; + if (!head) + break; + } + if (!prev) { /* no rtx packet in queue, become the new head */ + skb->next = flow->head; + flow->head = skb; + } else { + if (prev == flow->tail) + flow->tail = skb; + else + skb->next = prev->next; + prev->next = skb; + } +} + +static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct fq_flow *f; + + if (unlikely(sch->q.qlen >= sch->limit)) + return qdisc_drop(skb, sch); + + f = fq_classify(skb, q); + if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { + q->stat_flows_plimit++; + return qdisc_drop(skb, sch); + } + + f->qlen++; + flow_queue_add(f, skb); + if (skb_is_retransmit(skb)) + q->stat_tcp_retrans++; + sch->qstats.backlog += qdisc_pkt_len(skb); + if (fq_flow_is_detached(f)) { + fq_flow_add_tail(&q->new_flows, f); + if (q->quantum > f->credit) + f->credit = q->quantum; + q->inactive_flows--; + qdisc_unthrottled(sch); + } + if (unlikely(f == &q->internal)) { + q->stat_internal_packets++; + qdisc_unthrottled(sch); + } + sch->q.qlen++; + + return NET_XMIT_SUCCESS; +} + +static void fq_check_throttled(struct fq_sched_data *q, u64 now) +{ + struct rb_node *p; + + if (q->time_next_delayed_flow > now) + return; + + q->time_next_delayed_flow = ~0ULL; + while ((p = rb_first(&q->delayed)) != NULL) { + struct fq_flow *f = container_of(p, struct fq_flow, rate_node); + + if (f->time_next_packet > now) { + q->time_next_delayed_flow = f->time_next_packet; + break; + } + rb_erase(p, &q->delayed); + q->throttled_flows--; + fq_flow_add_tail(&q->old_flows, f); + } +} + +static struct sk_buff *fq_dequeue(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + u64 now = ktime_to_ns(ktime_get()); + struct fq_flow_head *head; + struct sk_buff *skb; + struct fq_flow *f; + + skb = fq_dequeue_head(&q->internal); + if (skb) + goto out; + fq_check_throttled(q, now); +begin: + head = &q->new_flows; + if (!head->first) { + head = &q->old_flows; + if (!head->first) { + if (q->time_next_delayed_flow != ~0ULL) + qdisc_watchdog_schedule_ns(&q->watchdog, + q->time_next_delayed_flow); + return NULL; + } + } + f = head->first; + + if (f->credit <= 0) { + f->credit += q->quantum; + head->first = f->next; + fq_flow_add_tail(&q->old_flows, f); + goto begin; + } + + if (unlikely(f->head && now < f->time_next_packet)) { + head->first = f->next; + fq_flow_set_throttled(q, f); + goto begin; + } + + skb = fq_dequeue_head(f); + if (!skb) { + head->first = f->next; + /* force a pass through old_flows to prevent starvation */ + if ((head == &q->new_flows) && q->old_flows.first) { + fq_flow_add_tail(&q->old_flows, f); + } else { + fq_flow_set_detached(f); + f->age = jiffies; + q->inactive_flows++; + } + goto begin; + } + prefetch(&skb->end); + f->time_next_packet = now; + f->credit -= qdisc_pkt_len(skb); + + if (f->credit <= 0 && + q->rate_enable && + skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) { + u32 rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate; + + rate = min(rate, q->flow_max_rate); + if (rate) { + u64 len = (u64)qdisc_pkt_len(skb) * NSEC_PER_SEC; + + do_div(len, rate); + /* Since socket rate can change later, + * clamp the delay to 125 ms. + * TODO: maybe segment the too big skb, as in commit + * e43ac79a4bc ("sch_tbf: segment too big GSO packets") + */ + if (unlikely(len > 125 * NSEC_PER_MSEC)) { + len = 125 * NSEC_PER_MSEC; + q->stat_pkts_too_long++; + } + + f->time_next_packet = now + len; + } + } +out: + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + qdisc_unthrottled(sch); + return skb; +} + +static void fq_reset(struct Qdisc *sch) +{ + struct sk_buff *skb; + + while ((skb = fq_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static void fq_rehash(struct fq_sched_data *q, + struct rb_root *old_array, u32 old_log, + struct rb_root *new_array, u32 new_log) +{ + struct rb_node *op, **np, *parent; + struct rb_root *oroot, *nroot; + struct fq_flow *of, *nf; + int fcnt = 0; + u32 idx; + + for (idx = 0; idx < (1U << old_log); idx++) { + oroot = &old_array[idx]; + while ((op = rb_first(oroot)) != NULL) { + rb_erase(op, oroot); + of = container_of(op, struct fq_flow, fq_node); + if (fq_gc_candidate(of)) { + fcnt++; + kmem_cache_free(fq_flow_cachep, of); + continue; + } + nroot = &new_array[hash_32((u32)(long)of->sk, new_log)]; + + np = &nroot->rb_node; + parent = NULL; + while (*np) { + parent = *np; + + nf = container_of(parent, struct fq_flow, fq_node); + BUG_ON(nf->sk == of->sk); + + if (nf->sk > of->sk) + np = &parent->rb_right; + else + np = &parent->rb_left; + } + + rb_link_node(&of->fq_node, parent, np); + rb_insert_color(&of->fq_node, nroot); + } + } + q->flows -= fcnt; + q->inactive_flows -= fcnt; + q->stat_gc_flows += fcnt; +} + +static int fq_resize(struct fq_sched_data *q, u32 log) +{ + struct rb_root *array; + u32 idx; + + if (q->fq_root && log == q->fq_trees_log) + return 0; + + array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL); + if (!array) + return -ENOMEM; + + for (idx = 0; idx < (1U << log); idx++) + array[idx] = RB_ROOT; + + if (q->fq_root) { + fq_rehash(q, q->fq_root, q->fq_trees_log, array, log); + kfree(q->fq_root); + } + q->fq_root = array; + q->fq_trees_log = log; + + return 0; +} + +static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { + [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, + [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, + [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, +}; + +static int fq_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_FQ_MAX + 1]; + int err, drop_count = 0; + u32 fq_log; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + + fq_log = q->fq_trees_log; + + if (tb[TCA_FQ_BUCKETS_LOG]) { + u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]); + + if (nval >= 1 && nval <= ilog2(256*1024)) + fq_log = nval; + else + err = -EINVAL; + } + if (tb[TCA_FQ_PLIMIT]) + sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]); + + if (tb[TCA_FQ_FLOW_PLIMIT]) + q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]); + + if (tb[TCA_FQ_QUANTUM]) + q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); + + if (tb[TCA_FQ_INITIAL_QUANTUM]) + q->quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); + + if (tb[TCA_FQ_FLOW_DEFAULT_RATE]) + q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]); + + if (tb[TCA_FQ_FLOW_MAX_RATE]) + q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); + + if (tb[TCA_FQ_RATE_ENABLE]) { + u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); + + if (enable <= 1) + q->rate_enable = enable; + else + err = -EINVAL; + } + + if (!err) + err = fq_resize(q, fq_log); + + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = fq_dequeue(sch); + + kfree_skb(skb); + drop_count++; + } + qdisc_tree_decrease_qlen(sch, drop_count); + + sch_tree_unlock(sch); + return err; +} + +static void fq_destroy(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct rb_root *root; + struct rb_node *p; + unsigned int idx; + + if (q->fq_root) { + for (idx = 0; idx < (1U << q->fq_trees_log); idx++) { + root = &q->fq_root[idx]; + while ((p = rb_first(root)) != NULL) { + rb_erase(p, root); + kmem_cache_free(fq_flow_cachep, + container_of(p, struct fq_flow, fq_node)); + } + } + kfree(q->fq_root); + } + qdisc_watchdog_cancel(&q->watchdog); +} + +static int fq_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_sched_data *q = qdisc_priv(sch); + int err; + + sch->limit = 10000; + q->flow_plimit = 100; + q->quantum = 2 * psched_mtu(qdisc_dev(sch)); + q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); + q->flow_default_rate = 0; + q->flow_max_rate = ~0U; + q->rate_enable = 1; + q->new_flows.first = NULL; + q->old_flows.first = NULL; + q->delayed = RB_ROOT; + q->fq_root = NULL; + q->fq_trees_log = ilog2(1024); + qdisc_watchdog_init(&q->watchdog, sch); + + if (opt) + err = fq_change(sch, opt); + else + err = fq_resize(q, q->fq_trees_log); + + return err; +} + +static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || + nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || + nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || + nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || + nla_put_u32(skb, TCA_FQ_FLOW_DEFAULT_RATE, q->flow_default_rate) || + nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) || + nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) + goto nla_put_failure; + + nla_nest_end(skb, opts); + return skb->len; + +nla_put_failure: + return -1; +} + +static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct fq_sched_data *q = qdisc_priv(sch); + u64 now = ktime_to_ns(ktime_get()); + struct tc_fq_qd_stats st = { + .gc_flows = q->stat_gc_flows, + .highprio_packets = q->stat_internal_packets, + .tcp_retrans = q->stat_tcp_retrans, + .throttled = q->stat_throttled, + .flows_plimit = q->stat_flows_plimit, + .pkts_too_long = q->stat_pkts_too_long, + .allocation_errors = q->stat_allocation_errors, + .flows = q->flows, + .inactive_flows = q->inactive_flows, + .throttled_flows = q->throttled_flows, + .time_next_delayed_flow = q->time_next_delayed_flow - now, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc_ops fq_qdisc_ops __read_mostly = { + .id = "fq", + .priv_size = sizeof(struct fq_sched_data), + + .enqueue = fq_enqueue, + .dequeue = fq_dequeue, + .peek = qdisc_peek_dequeued, + .init = fq_init, + .reset = fq_reset, + .destroy = fq_destroy, + .change = fq_change, + .dump = fq_dump, + .dump_stats = fq_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init fq_module_init(void) +{ + int ret; + + fq_flow_cachep = kmem_cache_create("fq_flow_cache", + sizeof(struct fq_flow), + 0, 0, NULL); + if (!fq_flow_cachep) + return -ENOMEM; + + ret = register_qdisc(&fq_qdisc_ops); + if (ret) + kmem_cache_destroy(fq_flow_cachep); + return ret; +} + +static void __exit fq_module_exit(void) +{ + unregister_qdisc(&fq_qdisc_ops); + kmem_cache_destroy(fq_flow_cachep); +} + +module_init(fq_module_init) +module_exit(fq_module_exit) +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 4626cef4b76e..a74e278654aa 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -25,10 +25,15 @@ #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/slab.h> +#include <linux/if_vlan.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/dst.h> +/* Qdisc to use by default */ +const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops; +EXPORT_SYMBOL(default_qdisc_ops); + /* Main transmission queue. */ /* Modifications to data participating in scheduling must be protected with @@ -207,15 +212,19 @@ void __qdisc_run(struct Qdisc *q) unsigned long dev_trans_start(struct net_device *dev) { - unsigned long val, res = dev->trans_start; + unsigned long val, res; unsigned int i; + if (is_vlan_dev(dev)) + dev = vlan_dev_real_dev(dev); + res = dev->trans_start; for (i = 0; i < dev->num_tx_queues; i++) { val = netdev_get_tx_queue(dev, i)->trans_start; if (val && time_after(val, res)) res = val; } dev->trans_start = res; + return res; } EXPORT_SYMBOL(dev_trans_start); @@ -525,12 +534,11 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { .dump = pfifo_fast_dump, .owner = THIS_MODULE, }; -EXPORT_SYMBOL(pfifo_fast_ops); static struct lock_class_key qdisc_tx_busylock; struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, - struct Qdisc_ops *ops) + const struct Qdisc_ops *ops) { void *p; struct Qdisc *sch; @@ -574,10 +582,14 @@ errout: } struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, - struct Qdisc_ops *ops, unsigned int parentid) + const struct Qdisc_ops *ops, + unsigned int parentid) { struct Qdisc *sch; + if (!try_module_get(ops->owner)) + goto errout; + sch = qdisc_alloc(dev_queue, ops); if (IS_ERR(sch)) goto errout; @@ -681,7 +693,7 @@ static void attach_one_default_qdisc(struct net_device *dev, if (dev->tx_queue_len) { qdisc = qdisc_create_dflt(dev_queue, - &pfifo_fast_ops, TC_H_ROOT); + default_qdisc_ops, TC_H_ROOT); if (!qdisc) { netdev_info(dev, "activation failed\n"); return; @@ -734,9 +746,8 @@ void dev_activate(struct net_device *dev) int need_watchdog; /* No queueing discipline is attached to device; - create default one i.e. pfifo_fast for devices, - which need queueing and noqueue_qdisc for - virtual interfaces + * create default one for devices, which need queueing + * and noqueue_qdisc for virtual interfaces */ if (dev->qdisc == &noop_qdisc) @@ -904,6 +915,7 @@ void psched_ratecfg_precompute(struct psched_ratecfg *r, memset(r, 0, sizeof(*r)); r->overhead = conf->overhead; r->rate_bytes_ps = conf->rate; + r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); r->mult = 1; /* * The deal here is to replace a divide by a reciprocal one diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 45e751527dfc..c2178b15ca6e 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1329,6 +1329,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)*arg, *parent; struct nlattr *opt = tca[TCA_OPTIONS]; + struct qdisc_rate_table *rtab = NULL, *ctab = NULL; struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_opt *hopt; @@ -1350,6 +1351,18 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!hopt->rate.rate || !hopt->ceil.rate) goto failure; + /* Keeping backward compatible with rate_table based iproute2 tc */ + if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) { + rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]); + if (rtab) + qdisc_put_rtab(rtab); + } + if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE) { + ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]); + if (ctab) + qdisc_put_rtab(ctab); + } + if (!cl) { /* new class */ struct Qdisc *new_q; int prio; diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 5da78a19ac9a..2e56185736d6 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -57,7 +57,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { dev_queue = netdev_get_tx_queue(dev, ntx); - qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops, + qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(ntx + 1))); if (qdisc == NULL) diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index accec33c454c..d44c868cb537 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -124,7 +124,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) for (i = 0; i < dev->num_tx_queues; i++) { dev_queue = netdev_get_tx_queue(dev, i); - qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops, + qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(i + 1))); if (qdisc == NULL) { diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 82f6016d89ab..a6d788d45216 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -412,12 +412,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* If a delay is expected, orphan the skb. (orphaning usually takes * place at TX completion time, so _before_ the link transit delay) - * Ideally, this orphaning should be done after the rate limiting - * module, because this breaks TCP Small Queue, and other mechanisms - * based on socket sk_wmem_alloc. */ if (q->latency || q->jitter) - skb_orphan(skb); + skb_orphan_partial(skb); /* * If we need to duplicate packet, then re-insert at top of the |