diff options
Diffstat (limited to 'net')
54 files changed, 1136 insertions, 889 deletions
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 303c779bfe38..43ba91c440bc 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -58,7 +58,7 @@ static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, u8 code, u8 ident, u16 dlen, void *data); static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, void *data); -static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data); +static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size); static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err); static void l2cap_tx(struct l2cap_chan *chan, struct l2cap_ctrl *control, @@ -1473,7 +1473,7 @@ static void l2cap_conn_start(struct l2cap_conn *conn) set_bit(CONF_REQ_SENT, &chan->conf_state); l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } @@ -2987,12 +2987,15 @@ static inline int l2cap_get_conf_opt(void **ptr, int *type, int *olen, return len; } -static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val) +static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val, size_t size) { struct l2cap_conf_opt *opt = *ptr; BT_DBG("type 0x%2.2x len %u val 0x%lx", type, len, val); + if (size < L2CAP_CONF_OPT_SIZE + len) + return; + opt->type = type; opt->len = len; @@ -3017,7 +3020,7 @@ static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val) *ptr += L2CAP_CONF_OPT_SIZE + len; } -static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan) +static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan, size_t size) { struct l2cap_conf_efs efs; @@ -3045,7 +3048,7 @@ static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan) } l2cap_add_conf_opt(ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs); + (unsigned long) &efs, size); } static void l2cap_ack_timeout(struct work_struct *work) @@ -3191,11 +3194,12 @@ static inline void l2cap_txwin_setup(struct l2cap_chan *chan) chan->ack_win = chan->tx_win; } -static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data) +static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size) { struct l2cap_conf_req *req = data; struct l2cap_conf_rfc rfc = { .mode = chan->mode }; void *ptr = req->data; + void *endptr = data + data_size; u16 size; BT_DBG("chan %p", chan); @@ -3220,7 +3224,7 @@ static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data) done: if (chan->imtu != L2CAP_DEFAULT_MTU) - l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, endptr - ptr); switch (chan->mode) { case L2CAP_MODE_BASIC: @@ -3239,7 +3243,7 @@ done: rfc.max_pdu_size = 0; l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); break; case L2CAP_MODE_ERTM: @@ -3259,21 +3263,21 @@ done: L2CAP_DEFAULT_TX_WINDOW); l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) - l2cap_add_opt_efs(&ptr, chan); + l2cap_add_opt_efs(&ptr, chan, endptr - ptr); if (test_bit(FLAG_EXT_CTRL, &chan->flags)) l2cap_add_conf_opt(&ptr, L2CAP_CONF_EWS, 2, - chan->tx_win); + chan->tx_win, endptr - ptr); if (chan->conn->feat_mask & L2CAP_FEAT_FCS) if (chan->fcs == L2CAP_FCS_NONE || test_bit(CONF_RECV_NO_FCS, &chan->conf_state)) { chan->fcs = L2CAP_FCS_NONE; l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1, - chan->fcs); + chan->fcs, endptr - ptr); } break; @@ -3291,17 +3295,17 @@ done: rfc.max_pdu_size = cpu_to_le16(size); l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) - l2cap_add_opt_efs(&ptr, chan); + l2cap_add_opt_efs(&ptr, chan, endptr - ptr); if (chan->conn->feat_mask & L2CAP_FEAT_FCS) if (chan->fcs == L2CAP_FCS_NONE || test_bit(CONF_RECV_NO_FCS, &chan->conf_state)) { chan->fcs = L2CAP_FCS_NONE; l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1, - chan->fcs); + chan->fcs, endptr - ptr); } break; } @@ -3312,10 +3316,11 @@ done: return ptr - data; } -static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data) +static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data_size) { struct l2cap_conf_rsp *rsp = data; void *ptr = rsp->data; + void *endptr = data + data_size; void *req = chan->conf_req; int len = chan->conf_len; int type, hint, olen; @@ -3417,7 +3422,7 @@ done: return -ECONNREFUSED; l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); } if (result == L2CAP_CONF_SUCCESS) { @@ -3430,7 +3435,7 @@ done: chan->omtu = mtu; set_bit(CONF_MTU_DONE, &chan->conf_state); } - l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->omtu); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->omtu, endptr - ptr); if (remote_efs) { if (chan->local_stype != L2CAP_SERV_NOTRAFIC && @@ -3444,7 +3449,7 @@ done: l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs); + (unsigned long) &efs, endptr - ptr); } else { /* Send PENDING Conf Rsp */ result = L2CAP_CONF_PENDING; @@ -3477,7 +3482,7 @@ done: set_bit(CONF_MODE_DONE, &chan->conf_state); l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, - sizeof(rfc), (unsigned long) &rfc); + sizeof(rfc), (unsigned long) &rfc, endptr - ptr); if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) { chan->remote_id = efs.id; @@ -3491,7 +3496,7 @@ done: le32_to_cpu(efs.sdu_itime); l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs); + (unsigned long) &efs, endptr - ptr); } break; @@ -3505,7 +3510,7 @@ done: set_bit(CONF_MODE_DONE, &chan->conf_state); l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); break; @@ -3527,10 +3532,11 @@ done: } static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, - void *data, u16 *result) + void *data, size_t size, u16 *result) { struct l2cap_conf_req *req = data; void *ptr = req->data; + void *endptr = data + size; int type, olen; unsigned long val; struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC }; @@ -3548,13 +3554,13 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, chan->imtu = L2CAP_DEFAULT_MIN_MTU; } else chan->imtu = val; - l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, endptr - ptr); break; case L2CAP_CONF_FLUSH_TO: chan->flush_to = val; l2cap_add_conf_opt(&ptr, L2CAP_CONF_FLUSH_TO, - 2, chan->flush_to); + 2, chan->flush_to, endptr - ptr); break; case L2CAP_CONF_RFC: @@ -3568,13 +3574,13 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, chan->fcs = 0; l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, - sizeof(rfc), (unsigned long) &rfc); + sizeof(rfc), (unsigned long) &rfc, endptr - ptr); break; case L2CAP_CONF_EWS: chan->ack_win = min_t(u16, val, chan->ack_win); l2cap_add_conf_opt(&ptr, L2CAP_CONF_EWS, 2, - chan->tx_win); + chan->tx_win, endptr - ptr); break; case L2CAP_CONF_EFS: @@ -3587,7 +3593,7 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, return -ECONNREFUSED; l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs); + (unsigned long) &efs, endptr - ptr); break; case L2CAP_CONF_FCS: @@ -3692,7 +3698,7 @@ void __l2cap_connect_rsp_defer(struct l2cap_chan *chan) return; l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } @@ -3900,7 +3906,7 @@ sendresp: u8 buf[128]; set_bit(CONF_REQ_SENT, &chan->conf_state); l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } @@ -3978,7 +3984,7 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, break; l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, req), req); + l2cap_build_conf_req(chan, req, sizeof(req)), req); chan->num_conf_req++; break; @@ -4090,7 +4096,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, } /* Complete config. */ - len = l2cap_parse_conf_req(chan, rsp); + len = l2cap_parse_conf_req(chan, rsp, sizeof(rsp)); if (len < 0) { l2cap_send_disconn_req(chan, ECONNRESET); goto unlock; @@ -4124,7 +4130,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, if (!test_and_set_bit(CONF_REQ_SENT, &chan->conf_state)) { u8 buf[64]; l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } @@ -4184,7 +4190,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, char buf[64]; len = l2cap_parse_conf_rsp(chan, rsp->data, len, - buf, &result); + buf, sizeof(buf), &result); if (len < 0) { l2cap_send_disconn_req(chan, ECONNRESET); goto done; @@ -4214,7 +4220,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, /* throw out any old stored conf requests */ result = L2CAP_CONF_SUCCESS; len = l2cap_parse_conf_rsp(chan, rsp->data, len, - req, &result); + req, sizeof(req), &result); if (len < 0) { l2cap_send_disconn_req(chan, ECONNRESET); goto done; @@ -4791,7 +4797,7 @@ static void l2cap_do_create(struct l2cap_chan *chan, int result, set_bit(CONF_REQ_SENT, &chan->conf_state); l2cap_send_cmd(chan->conn, l2cap_get_ident(chan->conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } } @@ -7465,7 +7471,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) set_bit(CONF_REQ_SENT, &chan->conf_state); l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 875675765531..63edc6e5f026 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -676,7 +676,8 @@ bad: /* * Do a synchronous statfs(). */ -int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) +int ceph_monc_do_statfs(struct ceph_mon_client *monc, u64 data_pool, + struct ceph_statfs *buf) { struct ceph_mon_generic_request *req; struct ceph_mon_statfs *h; @@ -696,6 +697,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) goto out; req->u.st = buf; + req->request->hdr.version = cpu_to_le16(2); mutex_lock(&monc->mutex); register_generic_request(req); @@ -705,6 +707,8 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) h->monhdr.session_mon = cpu_to_le16(-1); h->monhdr.session_mon_tid = 0; h->fsid = monc->monmap->fsid; + h->contains_data_pool = (data_pool != CEPH_NOPOOL); + h->data_pool = cpu_to_le64(data_pool); send_generic_request(monc, req); mutex_unlock(&monc->mutex); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index dcfbdd74dfd1..e02f01f534e2 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -863,8 +863,6 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, dst->cls.method_len = src->cls.method_len; dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); break; - case CEPH_OSD_OP_STARTSYNC: - break; case CEPH_OSD_OP_WATCH: dst->watch.cookie = cpu_to_le64(src->watch.cookie); dst->watch.ver = cpu_to_le64(0); @@ -916,9 +914,6 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, * if the file was recently truncated, we include information about its * old and new size so that the object can be updated appropriately. (we * avoid synchronously deleting truncated objects because it's slow.) - * - * if @do_sync, include a 'startsync' command so that the osd will flush - * data quickly. */ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, diff --git a/net/core/dev.c b/net/core/dev.c index 6f845e4fec17..fb766d906148 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3981,8 +3981,13 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), - skb); + int ret; + + preempt_disable(); + rcu_read_lock(); + ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + rcu_read_unlock(); + preempt_enable(); /* Consider XDP consuming the packet a success from * the netdev point of view we do not want to count @@ -4500,18 +4505,20 @@ static int netif_receive_skb_internal(struct sk_buff *skb) if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; - rcu_read_lock(); - if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), - skb); + int ret; - if (ret != XDP_PASS) { - rcu_read_unlock(); + preempt_disable(); + rcu_read_lock(); + ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + rcu_read_unlock(); + preempt_enable(); + + if (ret != XDP_PASS) return NET_RX_DROP; - } } + rcu_read_lock(); #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; diff --git a/net/core/filter.c b/net/core/filter.c index 5912c738a7b2..3a50a9b021e2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1794,6 +1794,7 @@ struct redirect_info { u32 flags; struct bpf_map *map; struct bpf_map *map_to_flush; + const struct bpf_prog *map_owner; }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); @@ -1807,7 +1808,6 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; - ri->map = NULL; return TC_ACT_REDIRECT; } @@ -2504,6 +2504,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); + const struct bpf_prog *map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; struct net_device *fwd; @@ -2511,6 +2512,15 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->ifindex = 0; ri->map = NULL; + ri->map_owner = NULL; + + /* This is really only caused by a deliberately crappy + * BPF program, normally we would never hit that case, + * so no need to inform someone via tracepoints either, + * just bail out. + */ + if (unlikely(map_owner != xdp_prog)) + return -EINVAL; fwd = __dev_map_lookup_elem(map, index); if (!fwd) { @@ -2607,6 +2617,8 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; + ri->map = NULL; + ri->map_owner = NULL; return XDP_REDIRECT; } @@ -2619,7 +2631,8 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags) +BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, + const struct bpf_prog *, map_owner) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); @@ -2629,10 +2642,14 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags ri->ifindex = ifindex; ri->flags = flags; ri->map = map; + ri->map_owner = map_owner; return XDP_REDIRECT; } +/* Note, arg4 is hidden from users and populated by the verifier + * with the right pointer. + */ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .func = bpf_xdp_redirect_map, .gpl_only = false, @@ -3592,7 +3609,11 @@ static bool xdp_is_valid_access(int off, int size, void bpf_warn_invalid_xdp_action(u32 act) { - WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act); + const u32 act_max = XDP_REDIRECT; + + WARN_ONCE(1, "%s XDP return value %u, expect packet loss!\n", + act > act_max ? "Illegal" : "Driver unsupported", + act); } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 68065d7d383f..16982de649b9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -710,14 +710,11 @@ EXPORT_SYMBOL(consume_skb); * consume_stateless_skb - free an skbuff, assuming it is stateless * @skb: buffer to free * - * Works like consume_skb(), but this variant assumes that all the head - * states have been already dropped. + * Alike consume_skb(), but this variant assumes that this is the last + * skb reference and all the head states have been already dropped */ -void consume_stateless_skb(struct sk_buff *skb) +void __consume_stateless_skb(struct sk_buff *skb) { - if (!skb_unref(skb)) - return; - trace_consume_skb(skb); skb_release_data(skb); kfree_skbmem(skb); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 129d1a3616f8..e1856bfa753d 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -618,8 +618,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) ip_rt_put(rt); goto tx_dropped; } - iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos, - key->ttl, df, !net_eq(tunnel->net, dev_net(dev))); + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, + df, !net_eq(tunnel->net, dev_net(dev))); return; tx_error: dev->stats.tx_errors++; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e04457198f93..9e2770fd00be 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -629,6 +629,7 @@ static void get_counters(const struct xt_table_info *t, ADD_COUNTER(counters[i], bcnt, pcnt); ++i; + cond_resched(); } } } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 576cba2b57e9..39286e543ee6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -776,6 +776,7 @@ get_counters(const struct xt_table_info *t, ADD_COUNTER(counters[i], bcnt, pcnt); ++i; /* macro does multi eval of i */ + cond_resched(); } } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a63486afa7a7..d9416b5162bc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1669,9 +1669,9 @@ process: */ sock_hold(sk); refcounted = true; - if (tcp_filter(sk, skb)) - goto discard_and_relse; - nsk = tcp_check_req(sk, skb, req, false); + nsk = NULL; + if (!tcp_filter(sk, skb)) + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); goto discard_and_relse; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index db1c9e78c83c..ef29df8648e4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1397,12 +1397,15 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) unlock_sock_fast(sk, slow); } + if (!skb_unref(skb)) + return; + /* In the more common cases we cleared the head states previously, * see __udp_queue_rcv_skb(). */ if (unlikely(udp_skb_has_head_state(skb))) skb_release_head_state(skb); - consume_stateless_skb(skb); + __consume_stateless_skb(skb); } EXPORT_SYMBOL_GPL(skb_consume_udp); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index a3b5c163325f..e5308d7cbd75 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -191,6 +191,12 @@ void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) } EXPORT_SYMBOL_GPL(rt6_free_pcpu); +static void fib6_free_table(struct fib6_table *table) +{ + inetpeer_invalidate_tree(&table->tb6_peers); + kfree(table); +} + static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; @@ -2022,15 +2028,22 @@ out_timer: static void fib6_net_exit(struct net *net) { + unsigned int i; + rt6_ifdown(net, NULL); del_timer_sync(&net->ipv6.ip6_fib_timer); -#ifdef CONFIG_IPV6_MULTIPLE_TABLES - inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers); - kfree(net->ipv6.fib6_local_tbl); -#endif - inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers); - kfree(net->ipv6.fib6_main_tbl); + for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { + struct hlist_head *head = &net->ipv6.fib_table_hash[i]; + struct hlist_node *tmp; + struct fib6_table *tb; + + hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { + hlist_del(&tb->tb6_hlist); + fib6_free_table(tb); + } + } + kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); fib6_notifier_exit(net); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 67ff2aaf5dcb..b7a72d409334 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -432,7 +432,9 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } break; case ICMPV6_PKT_TOOBIG: - mtu = be32_to_cpu(info) - offset; + mtu = be32_to_cpu(info) - offset - t->tun_hlen; + if (t->dev->type == ARPHRD_ETHER) + mtu -= ETH_HLEN; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; t->dev->mtu = mtu; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 3a0ba2ae4b0f..10a693a19323 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1184,6 +1184,7 @@ route_lookup: init_tel_txopt(&opt, encap_limit); ipv6_push_frag_opts(skb, &opt.ops, &proto); } + hop_limit = hop_limit ? : ip6_dst_hoplimit(dst); /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 54b1e75eded1..01bd3ee5ebc6 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -795,6 +795,7 @@ get_counters(const struct xt_table_info *t, ADD_COUNTER(counters[i], bcnt, pcnt); ++i; + cond_resched(); } } } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 38f76d8b231e..64d94afa427f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1460,9 +1460,9 @@ process: } sock_hold(sk); refcounted = true; - if (tcp_filter(sk, skb)) - goto discard_and_relse; - nsk = tcp_check_req(sk, skb, req, false); + nsk = NULL; + if (!tcp_filter(sk, skb)) + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); goto discard_and_relse; diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 2b36eff5d97e..2849a1fc41c5 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -245,10 +245,10 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d ieee80211_tx_skb(sdata, skb); } -void __ieee80211_start_rx_ba_session(struct sta_info *sta, - u8 dialog_token, u16 timeout, - u16 start_seq_num, u16 ba_policy, u16 tid, - u16 buf_size, bool tx, bool auto_seq) +void ___ieee80211_start_rx_ba_session(struct sta_info *sta, + u8 dialog_token, u16 timeout, + u16 start_seq_num, u16 ba_policy, u16 tid, + u16 buf_size, bool tx, bool auto_seq) { struct ieee80211_local *local = sta->sdata->local; struct tid_ampdu_rx *tid_agg_rx; @@ -267,7 +267,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, ht_dbg(sta->sdata, "STA %pM requests BA session on unsupported tid %d\n", sta->sta.addr, tid); - goto end_no_lock; + goto end; } if (!sta->sta.ht_cap.ht_supported) { @@ -275,14 +275,14 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, "STA %pM erroneously requests BA session on tid %d w/o QoS\n", sta->sta.addr, tid); /* send a response anyway, it's an error case if we get here */ - goto end_no_lock; + goto end; } if (test_sta_flag(sta, WLAN_STA_BLOCK_BA)) { ht_dbg(sta->sdata, "Suspend in progress - Denying ADDBA request (%pM tid %d)\n", sta->sta.addr, tid); - goto end_no_lock; + goto end; } /* sanity check for incoming parameters: @@ -296,7 +296,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, ht_dbg_ratelimited(sta->sdata, "AddBA Req with bad params from %pM on tid %u. policy %d, buffer size %d\n", sta->sta.addr, tid, ba_policy, buf_size); - goto end_no_lock; + goto end; } /* determine default buffer size */ if (buf_size == 0) @@ -311,7 +311,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, buf_size, sta->sta.addr); /* examine state machine */ - mutex_lock(&sta->ampdu_mlme.mtx); + lockdep_assert_held(&sta->ampdu_mlme.mtx); if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) { if (sta->ampdu_mlme.tid_rx_token[tid] == dialog_token) { @@ -415,15 +415,25 @@ end: __clear_bit(tid, sta->ampdu_mlme.unexpected_agg); sta->ampdu_mlme.tid_rx_token[tid] = dialog_token; } - mutex_unlock(&sta->ampdu_mlme.mtx); -end_no_lock: if (tx) ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid, dialog_token, status, 1, buf_size, timeout); } +void __ieee80211_start_rx_ba_session(struct sta_info *sta, + u8 dialog_token, u16 timeout, + u16 start_seq_num, u16 ba_policy, u16 tid, + u16 buf_size, bool tx, bool auto_seq) +{ + mutex_lock(&sta->ampdu_mlme.mtx); + ___ieee80211_start_rx_ba_session(sta, dialog_token, timeout, + start_seq_num, ba_policy, tid, + buf_size, tx, auto_seq); + mutex_unlock(&sta->ampdu_mlme.mtx); +} + void ieee80211_process_addba_request(struct ieee80211_local *local, struct sta_info *sta, struct ieee80211_mgmt *mgmt, diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index cbd48762256c..bef516ec47f9 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -226,7 +226,11 @@ ieee80211_agg_start_txq(struct sta_info *sta, int tid, bool enable) clear_bit(IEEE80211_TXQ_AMPDU, &txqi->flags); clear_bit(IEEE80211_TXQ_STOP, &txqi->flags); + local_bh_disable(); + rcu_read_lock(); drv_wake_tx_queue(sta->sdata->local, txqi); + rcu_read_unlock(); + local_bh_enable(); } /* @@ -436,7 +440,7 @@ static void sta_addba_resp_timer_expired(unsigned long data) test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) { rcu_read_unlock(); ht_dbg(sta->sdata, - "timer expired on %pM tid %d but we are not (or no longer) expecting addBA response there\n", + "timer expired on %pM tid %d not expecting addBA response\n", sta->sta.addr, tid); return; } @@ -639,7 +643,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, time_before(jiffies, sta->ampdu_mlme.last_addba_req_time[tid] + HT_AGG_RETRIES_PERIOD)) { ht_dbg(sdata, - "BA request denied - waiting a grace period after %d failed requests on %pM tid %u\n", + "BA request denied - %d failed requests on %pM tid %u\n", sta->ampdu_mlme.addba_req_num[tid], sta->sta.addr, tid); ret = -EBUSY; goto err_unlock_sta; diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index c92df492e898..d6d0b4201e40 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -300,6 +300,24 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, /* stopping might queue the work again - so cancel only afterwards */ cancel_work_sync(&sta->ampdu_mlme.work); + + /* + * In case the tear down is part of a reconfigure due to HW restart + * request, it is possible that the low level driver requested to stop + * the BA session, so handle it to properly clean tid_tx data. + */ + mutex_lock(&sta->ampdu_mlme.mtx); + for (i = 0; i < IEEE80211_NUM_TIDS; i++) { + struct tid_ampdu_tx *tid_tx = + rcu_dereference_protected_tid_tx(sta, i); + + if (!tid_tx) + continue; + + if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state)) + ieee80211_stop_tx_ba_cb(sta, i, tid_tx); + } + mutex_unlock(&sta->ampdu_mlme.mtx); } void ieee80211_ba_session_work(struct work_struct *work) @@ -333,9 +351,9 @@ void ieee80211_ba_session_work(struct work_struct *work) if (test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_manage_offl)) - __ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, - IEEE80211_MAX_AMPDU_BUF, - false, true); + ___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, + IEEE80211_MAX_AMPDU_BUF, + false, true); if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS, sta->ampdu_mlme.tid_rx_manage_offl)) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 2197c62a0a6e..9675814f64db 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1760,6 +1760,10 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, u16 buf_size, bool tx, bool auto_seq); +void ___ieee80211_start_rx_ba_session(struct sta_info *sta, + u8 dialog_token, u16 timeout, + u16 start_seq_num, u16 ba_policy, u16 tid, + u16 buf_size, bool tx, bool auto_seq); void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, enum ieee80211_agg_stop_reason reason); void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 9228ac73c429..f75029abf728 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -731,7 +731,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) sdata->vif.type == NL80211_IFTYPE_AP_VLAN || local->ops->wake_tx_queue) { /* XXX: for AP_VLAN, actually track AP queues */ - netif_tx_start_all_queues(dev); + if (dev) + netif_tx_start_all_queues(dev); } else if (dev) { unsigned long flags; int n_acs = IEEE80211_NUM_ACS; @@ -792,6 +793,7 @@ static int ieee80211_open(struct net_device *dev) static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_down) { + struct ieee80211_sub_if_data *txq_sdata = sdata; struct ieee80211_local *local = sdata->local; struct fq *fq = &local->fq; unsigned long flags; @@ -937,6 +939,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: + txq_sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, u.ap); + mutex_lock(&local->mtx); list_del(&sdata->u.vlan.list); mutex_unlock(&local->mtx); @@ -1007,8 +1012,17 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); - if (sdata->vif.txq) { - struct txq_info *txqi = to_txq_info(sdata->vif.txq); + if (txq_sdata->vif.txq) { + struct txq_info *txqi = to_txq_info(txq_sdata->vif.txq); + + /* + * FIXME FIXME + * + * We really shouldn't purge the *entire* txqi since that + * contains frames for the other AP_VLANs (and possibly + * the AP itself) as well, but there's no API in FQ now + * to be able to filter. + */ spin_lock_bh(&fq->lock); ieee80211_txq_purge(local, txqi); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index b588e593b0ec..3b8e2709d8de 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3155,7 +3155,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, if (len < 24 + 6) return; - reassoc = ieee80211_is_reassoc_req(mgmt->frame_control); + reassoc = ieee80211_is_reassoc_resp(mgmt->frame_control); capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code); aid = le16_to_cpu(mgmt->u.assoc_resp.aid); diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index f8e7a8bbc618..faf4f6055000 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -707,6 +707,8 @@ static int ieee80211_cancel_roc(struct ieee80211_local *local, if (!cookie) return -ENOENT; + flush_work(&local->hw_roc_start); + mutex_lock(&local->mtx); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (!mgmt_tx && roc->cookie != cookie) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 8858f4f185e9..94826680cf2b 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1276,11 +1276,6 @@ static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); } -static void ieee80211_set_skb_vif(struct sk_buff *skb, struct txq_info *txqi) -{ - IEEE80211_SKB_CB(skb)->control.vif = txqi->txq.vif; -} - static u32 codel_skb_len_func(const struct sk_buff *skb) { return skb->len; @@ -3414,6 +3409,7 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_tx_info *info; struct ieee80211_tx_data tx; ieee80211_tx_result r; + struct ieee80211_vif *vif; spin_lock_bh(&fq->lock); @@ -3430,8 +3426,6 @@ begin: if (!skb) goto out; - ieee80211_set_skb_vif(skb, txqi); - hdr = (struct ieee80211_hdr *)skb->data; info = IEEE80211_SKB_CB(skb); @@ -3488,6 +3482,34 @@ begin: } } + switch (tx.sdata->vif.type) { + case NL80211_IFTYPE_MONITOR: + if (tx.sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { + vif = &tx.sdata->vif; + break; + } + tx.sdata = rcu_dereference(local->monitor_sdata); + if (tx.sdata) { + vif = &tx.sdata->vif; + info->hw_queue = + vif->hw_queue[skb_get_queue_mapping(skb)]; + } else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { + ieee80211_free_txskb(&local->hw, skb); + goto begin; + } else { + vif = NULL; + } + break; + case NL80211_IFTYPE_AP_VLAN: + tx.sdata = container_of(tx.sdata->bss, + struct ieee80211_sub_if_data, u.ap); + /* fall through */ + default: + vif = &tx.sdata->vif; + break; + } + + IEEE80211_SKB_CB(skb)->control.vif = vif; out: spin_unlock_bh(&fq->lock); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 259698de569f..6aef6793d052 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1436,7 +1436,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, WLAN_EID_SSID_LIST, WLAN_EID_CHANNEL_USAGE, WLAN_EID_INTERWORKING, - /* mesh ID can't happen here */ + WLAN_EID_MESH_ID, /* 60 GHz can't happen here right now */ }; noffset = ieee80211_ie_split(ie, ie_len, diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 04fe25abc5f6..52cd2901a097 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -215,7 +215,7 @@ static void *__nf_hook_entries_try_shrink(struct nf_hook_entries __rcu **pp) if (skip == hook_entries) goto out_assign; - if (WARN_ON(skip == 0)) + if (skip == 0) return NULL; hook_entries -= skip; diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index e1efa446b305..57c8ee66491e 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -24,9 +24,13 @@ sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, if (sh) { sch = skb_header_pointer(skb, iph->len + sizeof(_sctph), sizeof(_schunkh), &_schunkh); - if (sch && (sch->type == SCTP_CID_INIT || - sysctl_sloppy_sctp(ipvs))) + if (sch) { + if (sch->type == SCTP_CID_ABORT || + !(sysctl_sloppy_sctp(ipvs) || + sch->type == SCTP_CID_INIT)) + return 1; ports = &sh->source; + } } } else { ports = skb_header_pointer( diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 40573aa6c133..f393a7086025 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -30,19 +30,17 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <linux/netfilter/nf_nat.h> +static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; + static DEFINE_MUTEX(nf_nat_proto_mutex); static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] __read_mostly; static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] __read_mostly; -struct nf_nat_conn_key { - const struct net *net; - const struct nf_conntrack_tuple *tuple; - const struct nf_conntrack_zone *zone; -}; - -static struct rhltable nf_nat_bysource_table; +static struct hlist_head *nf_nat_bysource __read_mostly; +static unsigned int nf_nat_htable_size __read_mostly; +static unsigned int nf_nat_hash_rnd __read_mostly; inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) @@ -118,17 +116,19 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) EXPORT_SYMBOL(nf_xfrm_me_harder); #endif /* CONFIG_XFRM */ -static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed) +/* We keep an extra hash for each conntrack, for fast searching. */ +static unsigned int +hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) { - const struct nf_conntrack_tuple *t; - const struct nf_conn *ct = data; + unsigned int hash; + + get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); - t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* Original src, to ensure we map it consistently if poss. */ + hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), + tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); - seed ^= net_hash_mix(nf_ct_net(ct)); - return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32), - t->dst.protonum ^ seed); + return reciprocal_scale(hash, nf_nat_htable_size); } /* Is this tuple already taken? (not by us) */ @@ -184,28 +184,6 @@ same_src(const struct nf_conn *ct, t->src.u.all == tuple->src.u.all); } -static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct nf_nat_conn_key *key = arg->key; - const struct nf_conn *ct = obj; - - if (!same_src(ct, key->tuple) || - !net_eq(nf_ct_net(ct), key->net) || - !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL)) - return 1; - - return 0; -} - -static struct rhashtable_params nf_nat_bysource_params = { - .head_offset = offsetof(struct nf_conn, nat_bysource), - .obj_hashfn = nf_nat_bysource_hash, - .obj_cmpfn = nf_nat_bysource_cmp, - .nelem_hint = 256, - .min_size = 1024, -}; - /* Only called for SRC manip */ static int find_appropriate_src(struct net *net, @@ -216,26 +194,22 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { + unsigned int h = hash_by_src(net, tuple); const struct nf_conn *ct; - struct nf_nat_conn_key key = { - .net = net, - .tuple = tuple, - .zone = zone - }; - struct rhlist_head *hl, *h; - - hl = rhltable_lookup(&nf_nat_bysource_table, &key, - nf_nat_bysource_params); - - rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) { - nf_ct_invert_tuplepr(result, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - result->dst = tuple->dst; - if (in_range(l3proto, l4proto, result, range)) - return 1; + hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { + if (same_src(ct, tuple) && + net_eq(net, nf_ct_net(ct)) && + nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { + /* Copy source part from reply tuple. */ + nf_ct_invert_tuplepr(result, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + result->dst = tuple->dst; + + if (in_range(l3proto, l4proto, result, range)) + return 1; + } } - return 0; } @@ -408,6 +382,7 @@ nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype) { + struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; /* Can't setup nat info for confirmed ct. */ @@ -416,7 +391,9 @@ nf_nat_setup_info(struct nf_conn *ct, WARN_ON(maniptype != NF_NAT_MANIP_SRC && maniptype != NF_NAT_MANIP_DST); - BUG_ON(nf_nat_initialized(ct, maniptype)); + + if (WARN_ON(nf_nat_initialized(ct, maniptype))) + return NF_DROP; /* What we've got will look like inverse of reply. Normally * this is what is in the conntrack, except for prior @@ -447,19 +424,16 @@ nf_nat_setup_info(struct nf_conn *ct, } if (maniptype == NF_NAT_MANIP_SRC) { - struct nf_nat_conn_key key = { - .net = nf_ct_net(ct), - .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - .zone = nf_ct_zone(ct), - }; - int err; - - err = rhltable_insert_key(&nf_nat_bysource_table, - &key, - &ct->nat_bysource, - nf_nat_bysource_params); - if (err) - return NF_DROP; + unsigned int srchash; + spinlock_t *lock; + + srchash = hash_by_src(net, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + lock = &nf_nat_locks[srchash % ARRAY_SIZE(nf_nat_locks)]; + spin_lock_bh(lock); + hlist_add_head_rcu(&ct->nat_bysource, + &nf_nat_bysource[srchash]); + spin_unlock_bh(lock); } /* It's done. */ @@ -553,6 +527,16 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data) return i->status & IPS_NAT_MASK ? 1 : 0; } +static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) +{ + unsigned int h; + + h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + spin_lock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); + hlist_del_rcu(&ct->nat_bysource); + spin_unlock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); +} + static int nf_nat_proto_clean(struct nf_conn *ct, void *data) { if (nf_nat_proto_remove(ct, data)) @@ -568,8 +552,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * will delete entry from already-freed table. */ clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); - rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, - nf_nat_bysource_params); + __nf_nat_cleanup_conntrack(ct); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. @@ -698,8 +681,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister); static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { if (ct->status & IPS_SRC_NAT_DONE) - rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, - nf_nat_bysource_params); + __nf_nat_cleanup_conntrack(ct); } static struct nf_ct_ext_type nat_extend __read_mostly = { @@ -821,19 +803,27 @@ static struct nf_ct_helper_expectfn follow_master_nat = { static int __init nf_nat_init(void) { - int ret; + int ret, i; - ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params); - if (ret) - return ret; + /* Leave them the same for the moment. */ + nf_nat_htable_size = nf_conntrack_htable_size; + if (nf_nat_htable_size < ARRAY_SIZE(nf_nat_locks)) + nf_nat_htable_size = ARRAY_SIZE(nf_nat_locks); + + nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); + if (!nf_nat_bysource) + return -ENOMEM; ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { - rhltable_destroy(&nf_nat_bysource_table); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); return ret; } + for (i = 0; i < ARRAY_SIZE(nf_nat_locks); i++) + spin_lock_init(&nf_nat_locks[i]); + nf_ct_helper_expectfn_register(&follow_master_nat); BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); @@ -863,8 +853,8 @@ static void __exit nf_nat_cleanup(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); - - rhltable_destroy(&nf_nat_bysource_table); + synchronize_net(); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); } MODULE_LICENSE("GPL"); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 10d48234f5f4..5da8746f7b88 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -35,6 +35,7 @@ #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/xt_hashlimit.h> #include <linux/mutex.h> +#include <linux/kernel.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); @@ -279,7 +280,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, size = cfg->size; } else { size = (totalram_pages << PAGE_SHIFT) / 16384 / - sizeof(struct list_head); + sizeof(struct hlist_head); if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE) size = 8192; if (size < 16) @@ -287,7 +288,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, } /* FIXME: don't use vmalloc() here or anywhere else -HW */ hinfo = vmalloc(sizeof(struct xt_hashlimit_htable) + - sizeof(struct list_head) * size); + sizeof(struct hlist_head) * size); if (hinfo == NULL) return -ENOMEM; *out_hinfo = hinfo; @@ -527,12 +528,12 @@ static u64 user2rate(u64 user) } } -static u64 user2rate_bytes(u64 user) +static u64 user2rate_bytes(u32 user) { u64 r; - r = user ? 0xFFFFFFFFULL / user : 0xFFFFFFFFULL; - r = (r - 1) << 4; + r = user ? U32_MAX / user : U32_MAX; + r = (r - 1) << XT_HASHLIMIT_BYTE_SHIFT; return r; } @@ -588,7 +589,8 @@ static void rateinfo_init(struct dsthash_ent *dh, dh->rateinfo.prev_window = 0; dh->rateinfo.current_rate = 0; if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { - dh->rateinfo.rate = user2rate_bytes(hinfo->cfg.avg); + dh->rateinfo.rate = + user2rate_bytes((u32)hinfo->cfg.avg); if (hinfo->cfg.burst) dh->rateinfo.burst = hinfo->cfg.burst * dh->rateinfo.rate; @@ -870,7 +872,7 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, /* Check for overflow. */ if (revision >= 3 && cfg->mode & XT_HASHLIMIT_RATE_MATCH) { - if (cfg->avg == 0) { + if (cfg->avg == 0 || cfg->avg > U32_MAX) { pr_info("hashlimit invalid rate\n"); return -ERANGE; } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5acee49db90b..327807731b44 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -691,6 +691,9 @@ static void deferred_put_nlk_sk(struct rcu_head *head) struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu); struct sock *sk = &nlk->sk; + kfree(nlk->groups); + nlk->groups = NULL; + if (!refcount_dec_and_test(&sk->sk_refcnt)) return; @@ -769,9 +772,6 @@ static int netlink_release(struct socket *sock) netlink_table_ungrab(); } - kfree(nlk->groups); - nlk->groups = NULL; - local_bh_disable(); sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); local_bh_enable(); @@ -955,7 +955,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, struct net *net = sock_net(sk); struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; - int err; + int err = 0; long unsigned int groups = nladdr->nl_groups; bool bound; @@ -983,6 +983,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, return -EINVAL; } + netlink_lock_table(); if (nlk->netlink_bind && groups) { int group; @@ -993,7 +994,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (!err) continue; netlink_undo_bind(group, groups, sk); - return err; + goto unlock; } } @@ -1006,12 +1007,13 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, netlink_autobind(sock); if (err) { netlink_undo_bind(nlk->ngroups, groups, sk); - return err; + goto unlock; } } if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) - return 0; + goto unlock; + netlink_unlock_table(); netlink_table_grab(); netlink_update_subscriptions(sk, nlk->subscriptions + @@ -1022,6 +1024,10 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, netlink_table_ungrab(); return 0; + +unlock: + netlink_unlock_table(); + return err; } static int netlink_connect(struct socket *sock, struct sockaddr *addr, @@ -1079,7 +1085,9 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, nladdr->nl_groups = netlink_group_mask(nlk->dst_group); } else { nladdr->nl_pid = nlk->portid; + netlink_lock_table(); nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; + netlink_unlock_table(); } return 0; } diff --git a/net/rds/send.c b/net/rds/send.c index 058a40743041..b52cdc8ae428 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -428,14 +428,18 @@ over_batch: * some work and we will skip our goto */ if (ret == 0) { + bool raced; + smp_mb(); + raced = send_gen != READ_ONCE(cp->cp_send_gen); + if ((test_bit(0, &conn->c_map_queued) || - !list_empty(&cp->cp_send_queue)) && - send_gen == READ_ONCE(cp->cp_send_gen)) { - rds_stats_inc(s_send_lock_queue_raced); + !list_empty(&cp->cp_send_queue)) && !raced) { if (batch_count < send_batch_count) goto restart; queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + } else if (raced) { + rds_stats_inc(s_send_lock_queue_raced); } } out: diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ea6c65fd5fc5..c743f03cfebd 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -182,7 +182,7 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, list_add_tail(&chain->list, &block->chain_list); chain->block = block; chain->index = chain_index; - chain->refcnt = 1; + chain->refcnt = 0; return chain; } @@ -217,15 +217,15 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, struct tcf_chain *chain; list_for_each_entry(chain, &block->chain_list, list) { - if (chain->index == chain_index) { - chain->refcnt++; - return chain; - } + if (chain->index == chain_index) + goto incref; } - if (create) - return tcf_chain_create(block, chain_index); - else - return NULL; + chain = create ? tcf_chain_create(block, chain_index) : NULL; + +incref: + if (chain) + chain->refcnt++; + return chain; } EXPORT_SYMBOL(tcf_chain_get); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index f31b28f788c0..2dd6c68ae91e 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -80,7 +80,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; } #endif diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index cd661a7f81e6..6ddfd4991108 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1215,7 +1215,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (cl == NULL) { if (err & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return err; } pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 0225d62a869f..a71be33f3afe 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -265,7 +265,8 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) sctp_ulpq_clear_pd(ulpq); if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) { - sp->data_ready_signalled = 1; + if (!sock_owned_by_user(sk)) + sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } return 1; diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index ac701c28f44f..c2c68a15b59d 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -171,10 +171,10 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs) /* * Add the temporary list to the backchannel preallocation list */ - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_splice(&tmp_list, &xprt->bc_pa_list); xprt_inc_alloc_count(xprt, min_reqs); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); dprintk("RPC: setup backchannel transport done\n"); return 0; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2e49d1f892b7..2ad827db2704 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1903,6 +1903,14 @@ call_connect_status(struct rpc_task *task) task->tk_status = 0; switch (status) { case -ECONNREFUSED: + /* A positive refusal suggests a rebind is needed. */ + if (RPC_IS_SOFTCONN(task)) + break; + if (clnt->cl_autobind) { + rpc_force_rebind(clnt); + task->tk_action = call_bind; + return; + } case -ECONNRESET: case -ECONNABORTED: case -ENETUNREACH: @@ -2139,10 +2147,6 @@ call_status(struct rpc_task *task) rpc_delay(task, 3*HZ); case -ETIMEDOUT: task->tk_action = call_timeout; - if (!(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) - && task->tk_client->cl_discrtry) - xprt_conditional_disconnect(req->rq_xprt, - req->rq_connect_cookie); break; case -ECONNREFUSED: case -ECONNRESET: diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 85ce0db5b0a6..aa04666f929d 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -421,7 +421,7 @@ __svc_init_bc(struct svc_serv *serv) */ static struct svc_serv * __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - struct svc_serv_ops *ops) + const struct svc_serv_ops *ops) { struct svc_serv *serv; unsigned int vers; @@ -486,7 +486,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv * svc_create(struct svc_program *prog, unsigned int bufsize, - struct svc_serv_ops *ops) + const struct svc_serv_ops *ops) { return __svc_create(prog, bufsize, /*npools*/1, ops); } @@ -494,7 +494,7 @@ EXPORT_SYMBOL_GPL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, - struct svc_serv_ops *ops) + const struct svc_serv_ops *ops) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index e18500151236..ff8e06cd067e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -693,7 +693,7 @@ static struct svc_xprt *svc_udp_create(struct svc_serv *serv, return svc_create_socket(serv, IPPROTO_UDP, net, sa, salen, flags); } -static struct svc_xprt_ops svc_udp_ops = { +static const struct svc_xprt_ops svc_udp_ops = { .xpo_create = svc_udp_create, .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, @@ -1013,7 +1013,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) if (!bc_xprt) return -EAGAIN; - spin_lock_bh(&bc_xprt->transport_lock); + spin_lock(&bc_xprt->recv_lock); req = xprt_lookup_rqst(bc_xprt, xid); if (!req) goto unlock_notfound; @@ -1031,7 +1031,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) memcpy(dst->iov_base, src->iov_base, src->iov_len); xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len); rqstp->rq_arg.len = 0; - spin_unlock_bh(&bc_xprt->transport_lock); + spin_unlock(&bc_xprt->recv_lock); return 0; unlock_notfound: printk(KERN_NOTICE @@ -1040,7 +1040,7 @@ unlock_notfound: __func__, ntohl(calldir), bc_xprt, ntohl(xid)); unlock_eagain: - spin_unlock_bh(&bc_xprt->transport_lock); + spin_unlock(&bc_xprt->recv_lock); return -EAGAIN; } @@ -1241,7 +1241,7 @@ static void svc_bc_tcp_sock_detach(struct svc_xprt *xprt) { } -static struct svc_xprt_ops svc_tcp_bc_ops = { +static const struct svc_xprt_ops svc_tcp_bc_ops = { .xpo_create = svc_bc_tcp_create, .xpo_detach = svc_bc_tcp_sock_detach, .xpo_free = svc_bc_sock_free, @@ -1275,7 +1275,7 @@ static void svc_cleanup_bc_xprt_sock(void) } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -static struct svc_xprt_ops svc_tcp_ops = { +static const struct svc_xprt_ops svc_tcp_ops = { .xpo_create = svc_tcp_create, .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 4654a9934269..e741ec2b4d8e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -844,6 +844,50 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) } EXPORT_SYMBOL_GPL(xprt_lookup_rqst); +/** + * xprt_pin_rqst - Pin a request on the transport receive list + * @req: Request to pin + * + * Caller must ensure this is atomic with the call to xprt_lookup_rqst() + * so should be holding the xprt transport lock. + */ +void xprt_pin_rqst(struct rpc_rqst *req) +{ + set_bit(RPC_TASK_MSG_RECV, &req->rq_task->tk_runstate); +} +EXPORT_SYMBOL_GPL(xprt_pin_rqst); + +/** + * xprt_unpin_rqst - Unpin a request on the transport receive list + * @req: Request to pin + * + * Caller should be holding the xprt transport lock. + */ +void xprt_unpin_rqst(struct rpc_rqst *req) +{ + struct rpc_task *task = req->rq_task; + + clear_bit(RPC_TASK_MSG_RECV, &task->tk_runstate); + if (test_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate)) + wake_up_bit(&task->tk_runstate, RPC_TASK_MSG_RECV); +} +EXPORT_SYMBOL_GPL(xprt_unpin_rqst); + +static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req) +__must_hold(&req->rq_xprt->recv_lock) +{ + struct rpc_task *task = req->rq_task; + + if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) { + spin_unlock(&req->rq_xprt->recv_lock); + set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); + wait_on_bit(&task->tk_runstate, RPC_TASK_MSG_RECV, + TASK_UNINTERRUPTIBLE); + clear_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); + spin_lock(&req->rq_xprt->recv_lock); + } +} + static void xprt_update_rtt(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; @@ -966,13 +1010,13 @@ void xprt_transmit(struct rpc_task *task) /* * Add to the list only if we're expecting a reply */ - spin_lock_bh(&xprt->transport_lock); /* Update the softirq receive buffer */ memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(req->rq_private_buf)); /* Add request to the receive list */ + spin_lock(&xprt->recv_lock); list_add_tail(&req->rq_list, &xprt->recv); - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); xprt_reset_majortimeo(req); /* Turn off autodisconnect */ del_singleshot_timer_sync(&xprt->timer); @@ -1287,12 +1331,16 @@ void xprt_release(struct rpc_task *task) task->tk_ops->rpc_count_stats(task, task->tk_calldata); else if (task->tk_client) rpc_count_iostats(task, task->tk_client->cl_metrics); + spin_lock(&xprt->recv_lock); + if (!list_empty(&req->rq_list)) { + list_del(&req->rq_list); + xprt_wait_on_pinned_rqst(req); + } + spin_unlock(&xprt->recv_lock); spin_lock_bh(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); if (xprt->ops->release_request) xprt->ops->release_request(task); - if (!list_empty(&req->rq_list)) - list_del(&req->rq_list); xprt->last_used = jiffies; xprt_schedule_autodisconnect(xprt); spin_unlock_bh(&xprt->transport_lock); @@ -1318,6 +1366,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->reserve_lock); + spin_lock_init(&xprt->recv_lock); INIT_LIST_HEAD(&xprt->free); INIT_LIST_HEAD(&xprt->recv); diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 03f6b5840764..d31d0ac5ada9 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -49,6 +49,7 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, if (IS_ERR(rb)) goto out_fail; req->rl_rdmabuf = rb; + xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); size = r_xprt->rx_data.inline_rsize; rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); @@ -202,20 +203,24 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) */ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) { - struct rpc_xprt *xprt = rqst->rq_xprt; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - struct rpcrdma_msg *headerp; - - headerp = rdmab_to_msg(req->rl_rdmabuf); - headerp->rm_xid = rqst->rq_xid; - headerp->rm_vers = rpcrdma_version; - headerp->rm_credit = - cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests); - headerp->rm_type = rdma_msg; - headerp->rm_body.rm_chunks[0] = xdr_zero; - headerp->rm_body.rm_chunks[1] = xdr_zero; - headerp->rm_body.rm_chunks[2] = xdr_zero; + __be32 *p; + + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); + xdr_init_encode(&req->rl_stream, &req->rl_hdrbuf, + req->rl_rdmabuf->rg_base); + + p = xdr_reserve_space(&req->rl_stream, 28); + if (unlikely(!p)) + return -EIO; + *p++ = rqst->rq_xid; + *p++ = rpcrdma_version; + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests); + *p++ = rdma_msg; + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN, &rqst->rq_snd_buf, rpcrdma_noch)) @@ -271,9 +276,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) * @xprt: transport receiving the call * @rep: receive buffer containing the call * - * Called in the RPC reply handler, which runs in a tasklet. - * Be quick about it. - * * Operational assumptions: * o Backchannel credits are ignored, just as the NFS server * forechannel currently does @@ -284,7 +286,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) { struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct rpcrdma_msg *headerp; struct svc_serv *bc_serv; struct rpcrdma_req *req; struct rpc_rqst *rqst; @@ -292,24 +293,15 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, size_t size; __be32 *p; - headerp = rdmab_to_msg(rep->rr_rdmabuf); + p = xdr_inline_decode(&rep->rr_stream, 0); + size = xdr_stream_remaining(&rep->rr_stream); + #ifdef RPCRDMA_BACKCHANNEL_DEBUG pr_info("RPC: %s: callback XID %08x, length=%u\n", - __func__, be32_to_cpu(headerp->rm_xid), rep->rr_len); - pr_info("RPC: %s: %*ph\n", __func__, rep->rr_len, headerp); + __func__, be32_to_cpup(p), size); + pr_info("RPC: %s: %*ph\n", __func__, size, p); #endif - /* Sanity check: - * Need at least enough bytes for RPC/RDMA header, as code - * here references the header fields by array offset. Also, - * backward calls are always inline, so ensure there - * are some bytes beyond the RPC/RDMA header. - */ - if (rep->rr_len < RPCRDMA_HDRLEN_MIN + 24) - goto out_short; - p = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN); - size = rep->rr_len - RPCRDMA_HDRLEN_MIN; - /* Grab a free bc rqst */ spin_lock(&xprt->bc_pa_lock); if (list_empty(&xprt->bc_pa_list)) { @@ -325,7 +317,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, /* Prepare rqst */ rqst->rq_reply_bytes_recvd = 0; rqst->rq_bytes_sent = 0; - rqst->rq_xid = headerp->rm_xid; + rqst->rq_xid = *p; rqst->rq_private_buf.len = size; set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); @@ -337,9 +329,9 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, buf->len = size; /* The receive buffer has to be hooked to the rpcrdma_req - * so that it can be reposted after the server is done - * parsing it but just before sending the backward - * direction reply. + * so that it is not released while the req is pointing + * to its buffer, and so that it can be reposted after + * the Upper Layer is done decoding it. */ req = rpcr_to_rdmar(rqst); dprintk("RPC: %s: attaching rep %p to req %p\n", @@ -367,13 +359,4 @@ out_overflow: * when the connection is re-established. */ return; - -out_short: - pr_warn("RPC/RDMA short backward direction call\n"); - - if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) - xprt_disconnect_done(xprt); - else - pr_warn("RPC: %s: reposting rep %p\n", - __func__, rep); } diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index d3f84bb1d443..6c7151341194 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -177,7 +177,7 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) /* Use the ib_map_phys_fmr() verb to register a memory region * for remote access via RDMA READ or RDMA WRITE. */ -static int +static struct rpcrdma_mr_seg * fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, struct rpcrdma_mw **out) { @@ -188,7 +188,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mw = rpcrdma_get_mw(r_xprt); if (!mw) - return -ENOBUFS; + return ERR_PTR(-ENOBUFS); pageoff = offset_in_page(seg1->mr_offset); seg1->mr_offset -= pageoff; /* start of page */ @@ -232,13 +232,13 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mw->mw_offset = dma_pages[0] + pageoff; *out = mw; - return mw->mw_nents; + return seg; out_dmamap_err: pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", mw->mw_sg, i); rpcrdma_put_mw(r_xprt, mw); - return -EIO; + return ERR_PTR(-EIO); out_maperr: pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", @@ -247,7 +247,7 @@ out_maperr: ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); rpcrdma_put_mw(r_xprt, mw); - return -EIO; + return ERR_PTR(-EIO); } /* Invalidate all memory regions that were registered for "req". diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 6aea36a38bfd..5a936a6a31a3 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -344,7 +344,7 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) /* Post a REG_MR Work Request to register a memory region * for remote access via RDMA READ or RDMA WRITE. */ -static int +static struct rpcrdma_mr_seg * frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, struct rpcrdma_mw **out) { @@ -364,7 +364,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, rpcrdma_defer_mr_recovery(mw); mw = rpcrdma_get_mw(r_xprt); if (!mw) - return -ENOBUFS; + return ERR_PTR(-ENOBUFS); } while (mw->frmr.fr_state != FRMR_IS_INVALID); frmr = &mw->frmr; frmr->fr_state = FRMR_IS_VALID; @@ -429,25 +429,25 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mw->mw_offset = mr->iova; *out = mw; - return mw->mw_nents; + return seg; out_dmamap_err: pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", mw->mw_sg, i); frmr->fr_state = FRMR_IS_INVALID; rpcrdma_put_mw(r_xprt, mw); - return -EIO; + return ERR_PTR(-EIO); out_mapmr_err: pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", frmr->fr_mr, n, mw->mw_nents); rpcrdma_defer_mr_recovery(mw); - return -EIO; + return ERR_PTR(-EIO); out_senderr: pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc); rpcrdma_defer_mr_recovery(mw); - return -ENOTCONN; + return ERR_PTR(-ENOTCONN); } /* Invalidate all memory regions that were registered for "req". diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index ca4d6e4528f3..f1889f4d4803 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -169,40 +169,41 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read; } -/* Split "vec" on page boundaries into segments. FMR registers pages, - * not a byte range. Other modes coalesce these segments into a single - * MR when they can. +/* Split @vec on page boundaries into SGEs. FMR registers pages, not + * a byte range. Other modes coalesce these SGEs into a single MR + * when they can. + * + * Returns pointer to next available SGE, and bumps the total number + * of SGEs consumed. */ -static int -rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n) +static struct rpcrdma_mr_seg * +rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, + unsigned int *n) { - size_t page_offset; - u32 remaining; + u32 remaining, page_offset; char *base; base = vec->iov_base; page_offset = offset_in_page(base); remaining = vec->iov_len; - while (remaining && n < RPCRDMA_MAX_SEGS) { - seg[n].mr_page = NULL; - seg[n].mr_offset = base; - seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); - remaining -= seg[n].mr_len; - base += seg[n].mr_len; - ++n; + while (remaining) { + seg->mr_page = NULL; + seg->mr_offset = base; + seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); + remaining -= seg->mr_len; + base += seg->mr_len; + ++seg; + ++(*n); page_offset = 0; } - return n; + return seg; } -/* - * Chunk assembly from upper layer xdr_buf. - * - * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk - * elements. Segments are then coalesced when registered, if possible - * within the selected memreg mode. +/* Convert @xdrbuf into SGEs no larger than a page each. As they + * are registered, these SGEs are then coalesced into RDMA segments + * when the selected memreg mode supports it. * - * Returns positive number of segments converted, or a negative errno. + * Returns positive number of SGEs consumed, or a negative errno. */ static int @@ -210,47 +211,41 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, unsigned int pos, enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg) { - int len, n, p, page_base; + unsigned long page_base; + unsigned int len, n; struct page **ppages; n = 0; - if (pos == 0) { - n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n); - if (n == RPCRDMA_MAX_SEGS) - goto out_overflow; - } + if (pos == 0) + seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n); len = xdrbuf->page_len; ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); page_base = offset_in_page(xdrbuf->page_base); - p = 0; - while (len && n < RPCRDMA_MAX_SEGS) { - if (!ppages[p]) { - /* alloc the pagelist for receiving buffer */ - ppages[p] = alloc_page(GFP_ATOMIC); - if (!ppages[p]) + while (len) { + if (unlikely(!*ppages)) { + /* XXX: Certain upper layer operations do + * not provide receive buffer pages. + */ + *ppages = alloc_page(GFP_ATOMIC); + if (!*ppages) return -EAGAIN; } - seg[n].mr_page = ppages[p]; - seg[n].mr_offset = (void *)(unsigned long) page_base; - seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); - if (seg[n].mr_len > PAGE_SIZE) - goto out_overflow; - len -= seg[n].mr_len; + seg->mr_page = *ppages; + seg->mr_offset = (char *)page_base; + seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); + len -= seg->mr_len; + ++ppages; + ++seg; ++n; - ++p; - page_base = 0; /* page offset only applies to first page */ + page_base = 0; } - /* Message overflows the seg array */ - if (len && n == RPCRDMA_MAX_SEGS) - goto out_overflow; - /* When encoding a Read chunk, the tail iovec contains an * XDR pad and may be omitted. */ if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) - return n; + goto out; /* When encoding a Write chunk, some servers need to see an * extra segment for non-XDR-aligned Write chunks. The upper @@ -258,30 +253,81 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, * for this purpose. */ if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) - return n; + goto out; - if (xdrbuf->tail[0].iov_len) { - n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n); - if (n == RPCRDMA_MAX_SEGS) - goto out_overflow; - } + if (xdrbuf->tail[0].iov_len) + seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); +out: + if (unlikely(n > RPCRDMA_MAX_SEGS)) + return -EIO; return n; +} -out_overflow: - pr_err("rpcrdma: segment array overflow\n"); - return -EIO; +static inline int +encode_item_present(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + *p = xdr_one; + return 0; } -static inline __be32 * +static inline int +encode_item_not_present(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + *p = xdr_zero; + return 0; +} + +static void xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) { *iptr++ = cpu_to_be32(mw->mw_handle); *iptr++ = cpu_to_be32(mw->mw_length); - return xdr_encode_hyper(iptr, mw->mw_offset); + xdr_encode_hyper(iptr, mw->mw_offset); } -/* XDR-encode the Read list. Supports encoding a list of read +static int +encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4 * sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + xdr_encode_rdma_segment(p, mw); + return 0; +} + +static int +encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw, + u32 position) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 6 * sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + *p++ = xdr_one; /* Item present */ + *p++ = cpu_to_be32(position); + xdr_encode_rdma_segment(p, mw); + return 0; +} + +/* Register and XDR encode the Read list. Supports encoding a list of read * segments that belong to a single read chunk. * * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): @@ -290,23 +336,20 @@ xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) * N elements, position P (same P for all chunks of same arg!): * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 * - * Returns a pointer to the XDR word in the RDMA header following - * the end of the Read list, or an error pointer. + * Returns zero on success, or a negative errno if a failure occurred. + * @xdr is advanced to the next position in the stream. + * + * Only a single @pos value is currently supported. */ -static __be32 * -rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_req *req, struct rpc_rqst *rqst, - __be32 *iptr, enum rpcrdma_chunktype rtype) +static noinline int +rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype) { + struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; struct rpcrdma_mw *mw; unsigned int pos; - int n, nsegs; - - if (rtype == rpcrdma_noch) { - *iptr++ = xdr_zero; /* item not present */ - return iptr; - } + int nsegs; pos = rqst->rq_snd_buf.head[0].iov_len; if (rtype == rpcrdma_areadch) @@ -315,40 +358,33 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, rtype, seg); if (nsegs < 0) - return ERR_PTR(nsegs); + return nsegs; do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - false, &mw); - if (n < 0) - return ERR_PTR(n); + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + false, &mw); + if (IS_ERR(seg)) + return PTR_ERR(seg); rpcrdma_push_mw(mw, &req->rl_registered); - *iptr++ = xdr_one; /* item present */ - - /* All read segments in this chunk - * have the same "position". - */ - *iptr++ = cpu_to_be32(pos); - iptr = xdr_encode_rdma_segment(iptr, mw); + if (encode_read_segment(xdr, mw, pos) < 0) + return -EMSGSIZE; dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, pos, mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, n < nsegs ? "more" : "last"); + mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); r_xprt->rx_stats.read_chunk_count++; - seg += n; - nsegs -= n; + nsegs -= mw->mw_nents; } while (nsegs); - /* Finish Read list */ - *iptr++ = xdr_zero; /* Next item not present */ - return iptr; + return 0; } -/* XDR-encode the Write list. Supports encoding a list containing - * one array of plain segments that belong to a single write chunk. +/* Register and XDR encode the Write list. Supports encoding a list + * containing one array of plain segments that belong to a single + * write chunk. * * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): * @@ -356,66 +392,65 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, * N elements: * 1 - N - HLOO - HLOO - ... - HLOO - 0 * - * Returns a pointer to the XDR word in the RDMA header following - * the end of the Write list, or an error pointer. + * Returns zero on success, or a negative errno if a failure occurred. + * @xdr is advanced to the next position in the stream. + * + * Only a single Write chunk is currently supported. */ -static __be32 * +static noinline int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, __be32 *iptr, - enum rpcrdma_chunktype wtype) + struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) { + struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; struct rpcrdma_mw *mw; - int n, nsegs, nchunks; + int nsegs, nchunks; __be32 *segcount; - if (wtype != rpcrdma_writech) { - *iptr++ = xdr_zero; /* no Write list present */ - return iptr; - } - seg = req->rl_segments; nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, rqst->rq_rcv_buf.head[0].iov_len, wtype, seg); if (nsegs < 0) - return ERR_PTR(nsegs); + return nsegs; - *iptr++ = xdr_one; /* Write list present */ - segcount = iptr++; /* save location of segment count */ + if (encode_item_present(xdr) < 0) + return -EMSGSIZE; + segcount = xdr_reserve_space(xdr, sizeof(*segcount)); + if (unlikely(!segcount)) + return -EMSGSIZE; + /* Actual value encoded below */ nchunks = 0; do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - true, &mw); - if (n < 0) - return ERR_PTR(n); + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + true, &mw); + if (IS_ERR(seg)) + return PTR_ERR(seg); rpcrdma_push_mw(mw, &req->rl_registered); - iptr = xdr_encode_rdma_segment(iptr, mw); + if (encode_rdma_segment(xdr, mw) < 0) + return -EMSGSIZE; dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, n < nsegs ? "more" : "last"); + mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); r_xprt->rx_stats.write_chunk_count++; r_xprt->rx_stats.total_rdma_request += seg->mr_len; nchunks++; - seg += n; - nsegs -= n; + nsegs -= mw->mw_nents; } while (nsegs); /* Update count of segments in this Write chunk */ *segcount = cpu_to_be32(nchunks); - /* Finish Write list */ - *iptr++ = xdr_zero; /* Next item not present */ - return iptr; + return 0; } -/* XDR-encode the Reply chunk. Supports encoding an array of plain - * segments that belong to a single write (reply) chunk. +/* Register and XDR encode the Reply chunk. Supports encoding an array + * of plain segments that belong to a single write (reply) chunk. * * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): * @@ -423,58 +458,57 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, * N elements: * 1 - N - HLOO - HLOO - ... - HLOO * - * Returns a pointer to the XDR word in the RDMA header following - * the end of the Reply chunk, or an error pointer. + * Returns zero on success, or a negative errno if a failure occurred. + * @xdr is advanced to the next position in the stream. */ -static __be32 * -rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_req *req, struct rpc_rqst *rqst, - __be32 *iptr, enum rpcrdma_chunktype wtype) +static noinline int +rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) { + struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; struct rpcrdma_mw *mw; - int n, nsegs, nchunks; + int nsegs, nchunks; __be32 *segcount; - if (wtype != rpcrdma_replych) { - *iptr++ = xdr_zero; /* no Reply chunk present */ - return iptr; - } - seg = req->rl_segments; nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); if (nsegs < 0) - return ERR_PTR(nsegs); + return nsegs; - *iptr++ = xdr_one; /* Reply chunk present */ - segcount = iptr++; /* save location of segment count */ + if (encode_item_present(xdr) < 0) + return -EMSGSIZE; + segcount = xdr_reserve_space(xdr, sizeof(*segcount)); + if (unlikely(!segcount)) + return -EMSGSIZE; + /* Actual value encoded below */ nchunks = 0; do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - true, &mw); - if (n < 0) - return ERR_PTR(n); + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + true, &mw); + if (IS_ERR(seg)) + return PTR_ERR(seg); rpcrdma_push_mw(mw, &req->rl_registered); - iptr = xdr_encode_rdma_segment(iptr, mw); + if (encode_rdma_segment(xdr, mw) < 0) + return -EMSGSIZE; dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, n < nsegs ? "more" : "last"); + mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); r_xprt->rx_stats.reply_chunk_count++; r_xprt->rx_stats.total_rdma_request += seg->mr_len; nchunks++; - seg += n; - nsegs -= n; + nsegs -= mw->mw_nents; } while (nsegs); /* Update count of segments in the Reply chunk */ *segcount = cpu_to_be32(nchunks); - return iptr; + return 0; } /* Prepare the RPC-over-RDMA header SGE. @@ -651,37 +685,52 @@ rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req) req->rl_mapped_sges = 0; } -/* - * Marshal a request: the primary job of this routine is to choose - * the transfer modes. See comments below. +/** + * rpcrdma_marshal_req - Marshal and send one RPC request + * @r_xprt: controlling transport + * @rqst: RPC request to be marshaled + * + * For the RPC in "rqst", this function: + * - Chooses the transfer mode (eg., RDMA_MSG or RDMA_NOMSG) + * - Registers Read, Write, and Reply chunks + * - Constructs the transport header + * - Posts a Send WR to send the transport header and request * - * Returns zero on success, otherwise a negative errno. + * Returns: + * %0 if the RPC was sent successfully, + * %-ENOTCONN if the connection was lost, + * %-EAGAIN if not enough pages are available for on-demand reply buffer, + * %-ENOBUFS if no MRs are available to register chunks, + * %-EMSGSIZE if the transport header is too small, + * %-EIO if a permanent problem occurred while marshaling. */ - int -rpcrdma_marshal_req(struct rpc_rqst *rqst) +rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - struct rpc_xprt *xprt = rqst->rq_xprt; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct xdr_stream *xdr = &req->rl_stream; enum rpcrdma_chunktype rtype, wtype; - struct rpcrdma_msg *headerp; bool ddp_allowed; - ssize_t hdrlen; - size_t rpclen; - __be32 *iptr; + __be32 *p; + int ret; #if defined(CONFIG_SUNRPC_BACKCHANNEL) if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)) return rpcrdma_bc_marshal_reply(rqst); #endif - headerp = rdmab_to_msg(req->rl_rdmabuf); - /* don't byte-swap XID, it's already done in request */ - headerp->rm_xid = rqst->rq_xid; - headerp->rm_vers = rpcrdma_version; - headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); - headerp->rm_type = rdma_msg; + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); + xdr_init_encode(xdr, &req->rl_hdrbuf, + req->rl_rdmabuf->rg_base); + + /* Fixed header fields */ + ret = -EMSGSIZE; + p = xdr_reserve_space(xdr, 4 * sizeof(*p)); + if (!p) + goto out_err; + *p++ = rqst->rq_xid; + *p++ = rpcrdma_version; + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); /* When the ULP employs a GSS flavor that guarantees integrity * or privacy, direct data placement of individual data items @@ -721,22 +770,17 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * by themselves are larger than the inline threshold. */ if (rpcrdma_args_inline(r_xprt, rqst)) { + *p++ = rdma_msg; rtype = rpcrdma_noch; - rpclen = rqst->rq_snd_buf.len; } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { + *p++ = rdma_msg; rtype = rpcrdma_readch; - rpclen = rqst->rq_snd_buf.head[0].iov_len + - rqst->rq_snd_buf.tail[0].iov_len; } else { r_xprt->rx_stats.nomsg_call_count++; - headerp->rm_type = htonl(RDMA_NOMSG); + *p++ = rdma_nomsg; rtype = rpcrdma_areadch; - rpclen = 0; } - req->rl_xid = rqst->rq_xid; - rpcrdma_insert_req(&r_xprt->rx_buf, req); - /* This implementation supports the following combinations * of chunk lists in one RPC-over-RDMA Call message: * @@ -759,79 +803,50 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * send a Call message with a Position Zero Read chunk and a * regular Read chunk at the same time. */ - iptr = headerp->rm_body.rm_chunks; - iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); - if (IS_ERR(iptr)) + if (rtype != rpcrdma_noch) { + ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype); + if (ret) + goto out_err; + } + ret = encode_item_not_present(xdr); + if (ret) goto out_err; - iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype); - if (IS_ERR(iptr)) + + if (wtype == rpcrdma_writech) { + ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype); + if (ret) + goto out_err; + } + ret = encode_item_not_present(xdr); + if (ret) goto out_err; - iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype); - if (IS_ERR(iptr)) + + if (wtype != rpcrdma_replych) + ret = encode_item_not_present(xdr); + else + ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype); + if (ret) goto out_err; - hdrlen = (unsigned char *)iptr - (unsigned char *)headerp; - dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n", + dprintk("RPC: %5u %s: %s/%s: hdrlen %u rpclen\n", rqst->rq_task->tk_pid, __func__, transfertypes[rtype], transfertypes[wtype], - hdrlen, rpclen); + xdr_stream_pos(xdr)); - if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen, + if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, + xdr_stream_pos(xdr), &rqst->rq_snd_buf, rtype)) { - iptr = ERR_PTR(-EIO); + ret = -EIO; goto out_err; } return 0; out_err: - if (PTR_ERR(iptr) != -ENOBUFS) { - pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n", - PTR_ERR(iptr)); + if (ret != -ENOBUFS) { + pr_err("rpcrdma: header marshaling failed (%d)\n", ret); r_xprt->rx_stats.failed_marshal_count++; } - return PTR_ERR(iptr); -} - -/* - * Chase down a received write or reply chunklist to get length - * RDMA'd by server. See map at rpcrdma_create_chunks()! :-) - */ -static int -rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp) -{ - unsigned int i, total_len; - struct rpcrdma_write_chunk *cur_wchunk; - char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); - - i = be32_to_cpu(**iptrp); - cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); - total_len = 0; - while (i--) { - struct rpcrdma_segment *seg = &cur_wchunk->wc_target; - ifdebug(FACILITY) { - u64 off; - xdr_decode_hyper((__be32 *)&seg->rs_offset, &off); - dprintk("RPC: %s: chunk %d@0x%016llx:0x%08x\n", - __func__, - be32_to_cpu(seg->rs_length), - (unsigned long long)off, - be32_to_cpu(seg->rs_handle)); - } - total_len += be32_to_cpu(seg->rs_length); - ++cur_wchunk; - } - /* check and adjust for properly terminated write chunk */ - if (wrchunk) { - __be32 *w = (__be32 *) cur_wchunk; - if (*w++ != xdr_zero) - return -1; - cur_wchunk = (struct rpcrdma_write_chunk *) w; - } - if ((char *)cur_wchunk > base + rep->rr_len) - return -1; - - *iptrp = (__be32 *) cur_wchunk; - return total_len; + return ret; } /** @@ -949,37 +964,254 @@ rpcrdma_mark_remote_invalidation(struct list_head *mws, } } -#if defined(CONFIG_SUNRPC_BACKCHANNEL) /* By convention, backchannel calls arrive via rdma_msg type * messages, and never populate the chunk lists. This makes * the RPC/RDMA header small and fixed in size, so it is * straightforward to check the RPC header's direction field. */ static bool -rpcrdma_is_bcall(struct rpcrdma_msg *headerp) +rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, + __be32 xid, __be32 proc) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) { - __be32 *p = (__be32 *)headerp; + struct xdr_stream *xdr = &rep->rr_stream; + __be32 *p; - if (headerp->rm_type != rdma_msg) + if (proc != rdma_msg) return false; - if (headerp->rm_body.rm_chunks[0] != xdr_zero) + + /* Peek at stream contents without advancing. */ + p = xdr_inline_decode(xdr, 0); + + /* Chunk lists */ + if (*p++ != xdr_zero) return false; - if (headerp->rm_body.rm_chunks[1] != xdr_zero) + if (*p++ != xdr_zero) return false; - if (headerp->rm_body.rm_chunks[2] != xdr_zero) + if (*p++ != xdr_zero) return false; - /* sanity */ - if (p[7] != headerp->rm_xid) + /* RPC header */ + if (*p++ != xid) return false; - /* call direction */ - if (p[8] != cpu_to_be32(RPC_CALL)) + if (*p != cpu_to_be32(RPC_CALL)) return false; + /* Now that we are sure this is a backchannel call, + * advance to the RPC header. + */ + p = xdr_inline_decode(xdr, 3 * sizeof(*p)); + if (unlikely(!p)) + goto out_short; + + rpcrdma_bc_receive_call(r_xprt, rep); + return true; + +out_short: + pr_warn("RPC/RDMA short backward direction call\n"); + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) + xprt_disconnect_done(&r_xprt->rx_xprt); return true; } +#else /* CONFIG_SUNRPC_BACKCHANNEL */ +{ + return false; +} #endif /* CONFIG_SUNRPC_BACKCHANNEL */ +static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 * sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + ifdebug(FACILITY) { + u64 offset; + u32 handle; + + handle = be32_to_cpup(p++); + *length = be32_to_cpup(p++); + xdr_decode_hyper(p, &offset); + dprintk("RPC: %s: segment %u@0x%016llx:0x%08x\n", + __func__, *length, (unsigned long long)offset, + handle); + } else { + *length = be32_to_cpup(p + 1); + } + + return 0; +} + +static int decode_write_chunk(struct xdr_stream *xdr, u32 *length) +{ + u32 segcount, seglength; + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + *length = 0; + segcount = be32_to_cpup(p); + while (segcount--) { + if (decode_rdma_segment(xdr, &seglength)) + return -EIO; + *length += seglength; + } + + dprintk("RPC: %s: segcount=%u, %u bytes\n", + __func__, be32_to_cpup(p), *length); + return 0; +} + +/* In RPC-over-RDMA Version One replies, a Read list is never + * expected. This decoder is a stub that returns an error if + * a Read list is present. + */ +static int decode_read_list(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + if (unlikely(*p != xdr_zero)) + return -EIO; + return 0; +} + +/* Supports only one Write chunk in the Write list + */ +static int decode_write_list(struct xdr_stream *xdr, u32 *length) +{ + u32 chunklen; + bool first; + __be32 *p; + + *length = 0; + first = true; + do { + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + if (*p == xdr_zero) + break; + if (!first) + return -EIO; + + if (decode_write_chunk(xdr, &chunklen)) + return -EIO; + *length += chunklen; + first = false; + } while (true); + return 0; +} + +static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + *length = 0; + if (*p != xdr_zero) + if (decode_write_chunk(xdr, length)) + return -EIO; + return 0; +} + +static int +rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, + struct rpc_rqst *rqst) +{ + struct xdr_stream *xdr = &rep->rr_stream; + u32 writelist, replychunk, rpclen; + char *base; + + /* Decode the chunk lists */ + if (decode_read_list(xdr)) + return -EIO; + if (decode_write_list(xdr, &writelist)) + return -EIO; + if (decode_reply_chunk(xdr, &replychunk)) + return -EIO; + + /* RDMA_MSG sanity checks */ + if (unlikely(replychunk)) + return -EIO; + + /* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */ + base = (char *)xdr_inline_decode(xdr, 0); + rpclen = xdr_stream_remaining(xdr); + r_xprt->rx_stats.fixup_copy_count += + rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3); + + r_xprt->rx_stats.total_rdma_reply += writelist; + return rpclen + xdr_align_size(writelist); +} + +static noinline int +rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) +{ + struct xdr_stream *xdr = &rep->rr_stream; + u32 writelist, replychunk; + + /* Decode the chunk lists */ + if (decode_read_list(xdr)) + return -EIO; + if (decode_write_list(xdr, &writelist)) + return -EIO; + if (decode_reply_chunk(xdr, &replychunk)) + return -EIO; + + /* RDMA_NOMSG sanity checks */ + if (unlikely(writelist)) + return -EIO; + if (unlikely(!replychunk)) + return -EIO; + + /* Reply chunk buffer already is the reply vector */ + r_xprt->rx_stats.total_rdma_reply += replychunk; + return replychunk; +} + +static noinline int +rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, + struct rpc_rqst *rqst) +{ + struct xdr_stream *xdr = &rep->rr_stream; + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + switch (*p) { + case err_vers: + p = xdr_inline_decode(xdr, 2 * sizeof(*p)); + if (!p) + break; + dprintk("RPC: %5u: %s: server reports version error (%u-%u)\n", + rqst->rq_task->tk_pid, __func__, + be32_to_cpup(p), be32_to_cpu(*(p + 1))); + break; + case err_chunk: + dprintk("RPC: %5u: %s: server reports header decoding error\n", + rqst->rq_task->tk_pid, __func__); + break; + default: + dprintk("RPC: %5u: %s: server reports unrecognized error %d\n", + rqst->rq_task->tk_pid, __func__, be32_to_cpup(p)); + } + + r_xprt->rx_stats.bad_reply_count++; + return -EREMOTEIO; +} + /* Process received RPC/RDMA messages. * * Errors must result in the RPC task either being awakened, or @@ -991,51 +1223,48 @@ rpcrdma_reply_handler(struct work_struct *work) struct rpcrdma_rep *rep = container_of(work, struct rpcrdma_rep, rr_work); struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct rpcrdma_msg *headerp; + struct xdr_stream *xdr = &rep->rr_stream; struct rpcrdma_req *req; struct rpc_rqst *rqst; - __be32 *iptr; - int rdmalen, status, rmerr; + __be32 *p, xid, vers, proc; unsigned long cwnd; - struct list_head mws; + int status; dprintk("RPC: %s: incoming rep %p\n", __func__, rep); - if (rep->rr_len == RPCRDMA_BAD_LEN) + if (rep->rr_hdrbuf.head[0].iov_len == 0) goto out_badstatus; - if (rep->rr_len < RPCRDMA_HDRLEN_ERR) + + xdr_init_decode(xdr, &rep->rr_hdrbuf, + rep->rr_hdrbuf.head[0].iov_base); + + /* Fixed transport header fields */ + p = xdr_inline_decode(xdr, 4 * sizeof(*p)); + if (unlikely(!p)) goto out_shortreply; + xid = *p++; + vers = *p++; + p++; /* credits */ + proc = *p++; - headerp = rdmab_to_msg(rep->rr_rdmabuf); -#if defined(CONFIG_SUNRPC_BACKCHANNEL) - if (rpcrdma_is_bcall(headerp)) - goto out_bcall; -#endif + if (rpcrdma_is_bcall(r_xprt, rep, xid, proc)) + return; /* Match incoming rpcrdma_rep to an rpcrdma_req to * get context for handling any incoming chunks. */ - spin_lock(&buf->rb_lock); - req = rpcrdma_lookup_req_locked(&r_xprt->rx_buf, - headerp->rm_xid); - if (!req) - goto out_nomatch; - if (req->rl_reply) - goto out_duplicate; - - list_replace_init(&req->rl_registered, &mws); - rpcrdma_mark_remote_invalidation(&mws, rep); - - /* Avoid races with signals and duplicate replies - * by marking this req as matched. - */ + spin_lock(&xprt->recv_lock); + rqst = xprt_lookup_rqst(xprt, xid); + if (!rqst) + goto out_norqst; + xprt_pin_rqst(rqst); + spin_unlock(&xprt->recv_lock); + req = rpcr_to_rdmar(rqst); req->rl_reply = rep; - spin_unlock(&buf->rb_lock); dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", - __func__, rep, req, be32_to_cpu(headerp->rm_xid)); + __func__, rep, req, be32_to_cpu(xid)); /* Invalidate and unmap the data payloads before waking the * waiting application. This guarantees the memory regions @@ -1044,99 +1273,42 @@ rpcrdma_reply_handler(struct work_struct *work) * waking the next RPC waits until this RPC has relinquished * all its Send Queue entries. */ - if (!list_empty(&mws)) - r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, &mws); + if (!list_empty(&req->rl_registered)) { + rpcrdma_mark_remote_invalidation(&req->rl_registered, rep); + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, + &req->rl_registered); + } - /* Perform XID lookup, reconstruction of the RPC reply, and - * RPC completion while holding the transport lock to ensure - * the rep, rqst, and rq_task pointers remain stable. - */ - spin_lock_bh(&xprt->transport_lock); - rqst = xprt_lookup_rqst(xprt, headerp->rm_xid); - if (!rqst) - goto out_norqst; xprt->reestablish_timeout = 0; - if (headerp->rm_vers != rpcrdma_version) + if (vers != rpcrdma_version) goto out_badversion; - /* check for expected message types */ - /* The order of some of these tests is important. */ - switch (headerp->rm_type) { + switch (proc) { case rdma_msg: - /* never expect read chunks */ - /* never expect reply chunks (two ways to check) */ - if (headerp->rm_body.rm_chunks[0] != xdr_zero || - (headerp->rm_body.rm_chunks[1] == xdr_zero && - headerp->rm_body.rm_chunks[2] != xdr_zero)) - goto badheader; - if (headerp->rm_body.rm_chunks[1] != xdr_zero) { - /* count any expected write chunks in read reply */ - /* start at write chunk array count */ - iptr = &headerp->rm_body.rm_chunks[2]; - rdmalen = rpcrdma_count_chunks(rep, 1, &iptr); - /* check for validity, and no reply chunk after */ - if (rdmalen < 0 || *iptr++ != xdr_zero) - goto badheader; - rep->rr_len -= - ((unsigned char *)iptr - (unsigned char *)headerp); - status = rep->rr_len + rdmalen; - r_xprt->rx_stats.total_rdma_reply += rdmalen; - /* special case - last chunk may omit padding */ - if (rdmalen &= 3) { - rdmalen = 4 - rdmalen; - status += rdmalen; - } - } else { - /* else ordinary inline */ - rdmalen = 0; - iptr = (__be32 *)((unsigned char *)headerp + - RPCRDMA_HDRLEN_MIN); - rep->rr_len -= RPCRDMA_HDRLEN_MIN; - status = rep->rr_len; - } - - r_xprt->rx_stats.fixup_copy_count += - rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, - rdmalen); + status = rpcrdma_decode_msg(r_xprt, rep, rqst); break; - case rdma_nomsg: - /* never expect read or write chunks, always reply chunks */ - if (headerp->rm_body.rm_chunks[0] != xdr_zero || - headerp->rm_body.rm_chunks[1] != xdr_zero || - headerp->rm_body.rm_chunks[2] != xdr_one) - goto badheader; - iptr = (__be32 *)((unsigned char *)headerp + - RPCRDMA_HDRLEN_MIN); - rdmalen = rpcrdma_count_chunks(rep, 0, &iptr); - if (rdmalen < 0) - goto badheader; - r_xprt->rx_stats.total_rdma_reply += rdmalen; - /* Reply chunk buffer already is the reply vector - no fixup. */ - status = rdmalen; + status = rpcrdma_decode_nomsg(r_xprt, rep); break; - case rdma_error: - goto out_rdmaerr; - -badheader: + status = rpcrdma_decode_error(r_xprt, rep, rqst); + break; default: - dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", - rqst->rq_task->tk_pid, __func__, - be32_to_cpu(headerp->rm_type)); status = -EIO; - r_xprt->rx_stats.bad_reply_count++; - break; } + if (status < 0) + goto out_badheader; out: + spin_lock(&xprt->recv_lock); cwnd = xprt->cwnd; xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; if (xprt->cwnd > cwnd) xprt_release_rqst_cong(rqst->rq_task); xprt_complete_rqst(rqst->rq_task, status); - spin_unlock_bh(&xprt->transport_lock); + xprt_unpin_rqst(rqst); + spin_unlock(&xprt->recv_lock); dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", __func__, xprt, rqst, status); return; @@ -1149,72 +1321,38 @@ out_badstatus: } return; -#if defined(CONFIG_SUNRPC_BACKCHANNEL) -out_bcall: - rpcrdma_bc_receive_call(r_xprt, rep); - return; -#endif - /* If the incoming reply terminated a pending RPC, the next * RPC call will post a replacement receive buffer as it is * being marshaled. */ out_badversion: dprintk("RPC: %s: invalid version %d\n", - __func__, be32_to_cpu(headerp->rm_vers)); + __func__, be32_to_cpu(vers)); status = -EIO; r_xprt->rx_stats.bad_reply_count++; goto out; -out_rdmaerr: - rmerr = be32_to_cpu(headerp->rm_body.rm_error.rm_err); - switch (rmerr) { - case ERR_VERS: - pr_err("%s: server reports header version error (%u-%u)\n", - __func__, - be32_to_cpu(headerp->rm_body.rm_error.rm_vers_low), - be32_to_cpu(headerp->rm_body.rm_error.rm_vers_high)); - break; - case ERR_CHUNK: - pr_err("%s: server reports header decoding error\n", - __func__); - break; - default: - pr_err("%s: server reports unknown error %d\n", - __func__, rmerr); - } - status = -EREMOTEIO; +out_badheader: + dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", + rqst->rq_task->tk_pid, __func__, be32_to_cpu(proc)); r_xprt->rx_stats.bad_reply_count++; + status = -EIO; goto out; -/* The req was still available, but by the time the transport_lock +/* The req was still available, but by the time the recv_lock * was acquired, the rqst and task had been released. Thus the RPC * has already been terminated. */ out_norqst: - spin_unlock_bh(&xprt->transport_lock); - rpcrdma_buffer_put(req); - dprintk("RPC: %s: race, no rqst left for req %p\n", - __func__, req); - return; + spin_unlock(&xprt->recv_lock); + dprintk("RPC: %s: no match for incoming xid 0x%08x\n", + __func__, be32_to_cpu(xid)); + goto repost; out_shortreply: dprintk("RPC: %s: short/invalid reply\n", __func__); goto repost; -out_nomatch: - spin_unlock(&buf->rb_lock); - dprintk("RPC: %s: no match for incoming xid 0x%08x len %d\n", - __func__, be32_to_cpu(headerp->rm_xid), - rep->rr_len); - goto repost; - -out_duplicate: - spin_unlock(&buf->rb_lock); - dprintk("RPC: %s: " - "duplicate reply %p to RPC request %p: xid 0x%08x\n", - __func__, rep, req, be32_to_cpu(headerp->rm_xid)); - /* If no pending RPC transaction was matched, post a replacement * receive buffer before returning. */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index c676ed0efb5a..ec37ad83b068 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -52,7 +52,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, if (src->iov_len < 24) goto out_shortreply; - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); req = xprt_lookup_rqst(xprt, xid); if (!req) goto out_notfound; @@ -69,17 +69,20 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, else if (credits > r_xprt->rx_buf.rb_bc_max_requests) credits = r_xprt->rx_buf.rb_bc_max_requests; + spin_lock_bh(&xprt->transport_lock); cwnd = xprt->cwnd; xprt->cwnd = credits << RPC_CWNDSHIFT; if (xprt->cwnd > cwnd) xprt_release_rqst_cong(req->rq_task); + spin_unlock_bh(&xprt->transport_lock); + ret = 0; xprt_complete_rqst(req->rq_task, rcvbuf->len); rcvbuf->len = 0; out_unlock: - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); out: return ret; @@ -266,7 +269,7 @@ xprt_rdma_bc_put(struct rpc_xprt *xprt) module_put(THIS_MODULE); } -static struct rpc_xprt_ops xprt_rdma_bc_procs = { +static const struct rpc_xprt_ops xprt_rdma_bc_procs = { .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, .alloc_slot = xprt_alloc_slot, diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 933f79bed270..7dcda4597057 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -660,19 +660,21 @@ out_initerr: return -EIO; } +/* Walk the segments in the Read chunk starting at @p and construct + * RDMA Read operations to pull the chunk to the server. + */ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, struct svc_rdma_read_info *info, __be32 *p) { int ret; + ret = -EINVAL; info->ri_chunklen = 0; - while (*p++ != xdr_zero) { + while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) { u32 rs_handle, rs_length; u64 rs_offset; - if (be32_to_cpup(p++) != info->ri_position) - break; rs_handle = be32_to_cpup(p++); rs_length = be32_to_cpup(p++); p = xdr_decode_hyper(p, &rs_offset); @@ -689,78 +691,6 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, return ret; } -/* If there is inline content following the Read chunk, append it to - * the page list immediately following the data payload. This has to - * be done after the reader function has determined how many pages - * were consumed for RDMA Read. - * - * On entry, ri_pageno and ri_pageoff point directly to the end of the - * page list. On exit, both have been updated to the new "next byte". - * - * Assumptions: - * - Inline content fits entirely in rq_pages[0] - * - Trailing content is only a handful of bytes - */ -static int svc_rdma_copy_tail(struct svc_rqst *rqstp, - struct svc_rdma_read_info *info) -{ - struct svc_rdma_op_ctxt *head = info->ri_readctxt; - unsigned int tail_length, remaining; - u8 *srcp, *destp; - - /* Assert that all inline content fits in page 0. This is an - * implementation limit, not a protocol limit. - */ - if (head->arg.head[0].iov_len > PAGE_SIZE) { - pr_warn_once("svcrdma: too much trailing inline content\n"); - return -EINVAL; - } - - srcp = head->arg.head[0].iov_base; - srcp += info->ri_position; - tail_length = head->arg.head[0].iov_len - info->ri_position; - remaining = tail_length; - - /* If there is room on the last page in the page list, try to - * fit the trailing content there. - */ - if (info->ri_pageoff > 0) { - unsigned int len; - - len = min_t(unsigned int, remaining, - PAGE_SIZE - info->ri_pageoff); - destp = page_address(rqstp->rq_pages[info->ri_pageno]); - destp += info->ri_pageoff; - - memcpy(destp, srcp, len); - srcp += len; - destp += len; - info->ri_pageoff += len; - remaining -= len; - - if (info->ri_pageoff == PAGE_SIZE) { - info->ri_pageno++; - info->ri_pageoff = 0; - } - } - - /* Otherwise, a fresh page is needed. */ - if (remaining) { - head->arg.pages[info->ri_pageno] = - rqstp->rq_pages[info->ri_pageno]; - head->count++; - - destp = page_address(rqstp->rq_pages[info->ri_pageno]); - memcpy(destp, srcp, remaining); - info->ri_pageoff += remaining; - } - - head->arg.page_len += tail_length; - head->arg.len += tail_length; - head->arg.buflen += tail_length; - return 0; -} - /* Construct RDMA Reads to pull over a normal Read chunk. The chunk * data lands in the page list of head->arg.pages. * @@ -785,34 +715,28 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, if (ret < 0) goto out; - /* Read chunk may need XDR round-up (see RFC 5666, s. 3.7). + /* Split the Receive buffer between the head and tail + * buffers at Read chunk's position. XDR roundup of the + * chunk is not included in either the pagelist or in + * the tail. */ - if (info->ri_chunklen & 3) { - u32 padlen = 4 - (info->ri_chunklen & 3); - - info->ri_chunklen += padlen; + head->arg.tail[0].iov_base = + head->arg.head[0].iov_base + info->ri_position; + head->arg.tail[0].iov_len = + head->arg.head[0].iov_len - info->ri_position; + head->arg.head[0].iov_len = info->ri_position; - /* NB: data payload always starts on XDR alignment, - * thus the pad can never contain a page boundary. - */ - info->ri_pageoff += padlen; - if (info->ri_pageoff == PAGE_SIZE) { - info->ri_pageno++; - info->ri_pageoff = 0; - } - } + /* Read chunk may need XDR roundup (see RFC 5666, s. 3.7). + * + * NFSv2/3 write decoders need the length of the tail to + * contain the size of the roundup padding. + */ + head->arg.tail[0].iov_len += 4 - (info->ri_chunklen & 3); head->arg.page_len = info->ri_chunklen; head->arg.len += info->ri_chunklen; head->arg.buflen += info->ri_chunklen; - if (info->ri_position < head->arg.head[0].iov_len) { - ret = svc_rdma_copy_tail(rqstp, info); - if (ret < 0) - goto out; - } - head->arg.head[0].iov_len = info->ri_position; - out: return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index e660d4965b18..5caf8e722a11 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -51,6 +51,7 @@ #include <linux/workqueue.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> +#include <rdma/rw.h> #include <linux/sunrpc/svc_rdma.h> #include <linux/export.h> #include "xprt_rdma.h" @@ -70,7 +71,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt); static int svc_rdma_secure_port(struct svc_rqst *); static void svc_rdma_kill_temp_xprt(struct svc_xprt *); -static struct svc_xprt_ops svc_rdma_ops = { +static const struct svc_xprt_ops svc_rdma_ops = { .xpo_create = svc_rdma_create, .xpo_recvfrom = svc_rdma_recvfrom, .xpo_sendto = svc_rdma_sendto, @@ -98,7 +99,7 @@ static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *, struct net *, static void svc_rdma_bc_detach(struct svc_xprt *); static void svc_rdma_bc_free(struct svc_xprt *); -static struct svc_xprt_ops svc_rdma_bc_ops = { +static const struct svc_xprt_ops svc_rdma_bc_ops = { .xpo_create = svc_rdma_bc_create, .xpo_detach = svc_rdma_bc_detach, .xpo_free = svc_rdma_bc_free, @@ -167,8 +168,8 @@ static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt) { unsigned int i; - /* Each RPC/RDMA credit can consume a number of send - * and receive WQEs. One ctxt is allocated for each. + /* Each RPC/RDMA credit can consume one Receive and + * one Send WQE at the same time. */ i = xprt->sc_sq_depth + xprt->sc_rq_depth; @@ -713,7 +714,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct ib_qp_init_attr qp_attr; struct ib_device *dev; struct sockaddr *sap; - unsigned int i; + unsigned int i, ctxts; int ret = 0; listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); @@ -742,14 +743,26 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge, (size_t)RPCSVC_MAXPAGES); newxprt->sc_max_req_size = svcrdma_max_req_size; - newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr, - svcrdma_max_requests); - newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); - newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr, - svcrdma_max_bc_requests); + newxprt->sc_max_requests = svcrdma_max_requests; + newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; newxprt->sc_rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests; - newxprt->sc_sq_depth = newxprt->sc_rq_depth; + if (newxprt->sc_rq_depth > dev->attrs.max_qp_wr) { + pr_warn("svcrdma: reducing receive depth to %d\n", + dev->attrs.max_qp_wr); + newxprt->sc_rq_depth = dev->attrs.max_qp_wr; + newxprt->sc_max_requests = newxprt->sc_rq_depth - 2; + newxprt->sc_max_bc_requests = 2; + } + newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); + ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES); + ctxts *= newxprt->sc_max_requests; + newxprt->sc_sq_depth = newxprt->sc_rq_depth + ctxts; + if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) { + pr_warn("svcrdma: reducing send depth to %d\n", + dev->attrs.max_qp_wr); + newxprt->sc_sq_depth = dev->attrs.max_qp_wr; + } atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); if (!svc_rdma_prealloc_ctxts(newxprt)) @@ -784,8 +797,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.event_handler = qp_event_handler; qp_attr.qp_context = &newxprt->sc_xprt; qp_attr.port_num = newxprt->sc_port_num; - qp_attr.cap.max_rdma_ctxs = newxprt->sc_max_requests; - qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; + qp_attr.cap.max_rdma_ctxs = ctxts; + qp_attr.cap.max_send_wr = newxprt->sc_sq_depth - ctxts; qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth; qp_attr.cap.max_send_sge = newxprt->sc_max_sge; qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; @@ -853,6 +866,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); dprintk(" max_sge : %d\n", newxprt->sc_max_sge); dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); + dprintk(" rdma_rw_ctxs : %d\n", ctxts); dprintk(" max_requests : %d\n", newxprt->sc_max_requests); dprintk(" ord : %d\n", newxprt->sc_ord); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index d1c458e5ec4d..c84e2b644e13 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -149,7 +149,7 @@ static struct ctl_table sunrpc_table[] = { #endif -static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */ +static const struct rpc_xprt_ops xprt_rdma_procs; static void xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) @@ -559,6 +559,7 @@ rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, r_xprt->rx_stats.hardway_register_count += size; req->rl_rdmabuf = rb; + xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); return true; } @@ -684,7 +685,6 @@ xprt_rdma_free(struct rpc_task *task) dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); - rpcrdma_remove_req(&r_xprt->rx_buf, req); if (!list_empty(&req->rl_registered)) ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task)); rpcrdma_unmap_sges(ia, req); @@ -730,7 +730,7 @@ xprt_rdma_send_request(struct rpc_task *task) if (unlikely(!list_empty(&req->rl_registered))) r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); - rc = rpcrdma_marshal_req(rqst); + rc = rpcrdma_marshal_req(r_xprt, rqst); if (rc < 0) goto failed_marshal; @@ -811,7 +811,7 @@ xprt_rdma_disable_swap(struct rpc_xprt *xprt) * Plumbing for rpc transport switch and kernel module */ -static struct rpc_xprt_ops xprt_rdma_procs = { +static const struct rpc_xprt_ops xprt_rdma_procs = { .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ .alloc_slot = xprt_alloc_slot, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e4171f2abe37..11a1fbf7e59e 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -139,14 +139,11 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) static void rpcrdma_update_granted_credits(struct rpcrdma_rep *rep) { - struct rpcrdma_msg *rmsgp = rdmab_to_msg(rep->rr_rdmabuf); struct rpcrdma_buffer *buffer = &rep->rr_rxprt->rx_buf; + __be32 *p = rep->rr_rdmabuf->rg_base; u32 credits; - if (rep->rr_len < RPCRDMA_HDRLEN_ERR) - return; - - credits = be32_to_cpu(rmsgp->rm_credit); + credits = be32_to_cpup(p + 2); if (credits == 0) credits = 1; /* don't deadlock */ else if (credits > buffer->rb_max_requests) @@ -173,21 +170,19 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) goto out_fail; /* status == SUCCESS means all fields in wc are trustworthy */ - if (wc->opcode != IB_WC_RECV) - return; - dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n", __func__, rep, wc->byte_len); - rep->rr_len = wc->byte_len; + rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len); rep->rr_wc_flags = wc->wc_flags; rep->rr_inv_rkey = wc->ex.invalidate_rkey; ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf), rdmab_addr(rep->rr_rdmabuf), - rep->rr_len, DMA_FROM_DEVICE); + wc->byte_len, DMA_FROM_DEVICE); - rpcrdma_update_granted_credits(rep); + if (wc->byte_len >= RPCRDMA_HDRLEN_ERR) + rpcrdma_update_granted_credits(rep); out_schedule: queue_work(rpcrdma_receive_wq, &rep->rr_work); @@ -198,7 +193,7 @@ out_fail: pr_err("rpcrdma: Recv: %s (%u/0x%x)\n", ib_wc_status_msg(wc->status), wc->status, wc->vendor_err); - rep->rr_len = RPCRDMA_BAD_LEN; + rpcrdma_set_xdrlen(&rep->rr_hdrbuf, 0); goto out_schedule; } @@ -974,6 +969,8 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) rc = PTR_ERR(rep->rr_rdmabuf); goto out_free; } + xdr_buf_init(&rep->rr_hdrbuf, rep->rr_rdmabuf->rg_base, + rdmab_length(rep->rr_rdmabuf)); rep->rr_cqe.done = rpcrdma_wc_receive; rep->rr_rxprt = r_xprt; @@ -1004,7 +1001,6 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) spin_lock_init(&buf->rb_recovery_lock); INIT_LIST_HEAD(&buf->rb_mws); INIT_LIST_HEAD(&buf->rb_all); - INIT_LIST_HEAD(&buf->rb_pending); INIT_LIST_HEAD(&buf->rb_stale_mrs); INIT_DELAYED_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index b282d3f8cdd8..e26a97d2f922 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -218,18 +218,17 @@ enum { struct rpcrdma_rep { struct ib_cqe rr_cqe; - unsigned int rr_len; int rr_wc_flags; u32 rr_inv_rkey; + struct rpcrdma_regbuf *rr_rdmabuf; struct rpcrdma_xprt *rr_rxprt; struct work_struct rr_work; + struct xdr_buf rr_hdrbuf; + struct xdr_stream rr_stream; struct list_head rr_list; struct ib_recv_wr rr_recv_wr; - struct rpcrdma_regbuf *rr_rdmabuf; }; -#define RPCRDMA_BAD_LEN (~0U) - /* * struct rpcrdma_mw - external memory region metadata * @@ -341,11 +340,12 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_list; - __be32 rl_xid; unsigned int rl_mapped_sges; unsigned int rl_connect_cookie; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply; + struct xdr_stream rl_stream; + struct xdr_buf rl_hdrbuf; struct ib_send_wr rl_send_wr; struct ib_sge rl_send_sge[RPCRDMA_MAX_SEND_SGES]; struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */ @@ -403,7 +403,6 @@ struct rpcrdma_buffer { int rb_send_count, rb_recv_count; struct list_head rb_send_bufs; struct list_head rb_recv_bufs; - struct list_head rb_pending; u32 rb_max_requests; atomic_t rb_credits; /* most recent credit grant */ @@ -440,24 +439,27 @@ struct rpcrdma_create_data_internal { * Statistics for RPCRDMA */ struct rpcrdma_stats { + /* accessed when sending a call */ unsigned long read_chunk_count; unsigned long write_chunk_count; unsigned long reply_chunk_count; - unsigned long long total_rdma_request; - unsigned long long total_rdma_reply; + /* rarely accessed error counters */ unsigned long long pullup_copy_count; - unsigned long long fixup_copy_count; unsigned long hardway_register_count; unsigned long failed_marshal_count; unsigned long bad_reply_count; - unsigned long nomsg_call_count; - unsigned long bcall_count; unsigned long mrs_recovered; unsigned long mrs_orphaned; unsigned long mrs_allocated; + + /* accessed when receiving a reply */ + unsigned long long total_rdma_reply; + unsigned long long fixup_copy_count; unsigned long local_inv_needed; + unsigned long nomsg_call_count; + unsigned long bcall_count; }; /* @@ -465,7 +467,8 @@ struct rpcrdma_stats { */ struct rpcrdma_xprt; struct rpcrdma_memreg_ops { - int (*ro_map)(struct rpcrdma_xprt *, + struct rpcrdma_mr_seg * + (*ro_map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool, struct rpcrdma_mw **); void (*ro_unmap_sync)(struct rpcrdma_xprt *, @@ -552,34 +555,6 @@ void rpcrdma_destroy_req(struct rpcrdma_req *); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); -static inline void -rpcrdma_insert_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) -{ - spin_lock(&buffers->rb_lock); - if (list_empty(&req->rl_list)) - list_add_tail(&req->rl_list, &buffers->rb_pending); - spin_unlock(&buffers->rb_lock); -} - -static inline struct rpcrdma_req * -rpcrdma_lookup_req_locked(struct rpcrdma_buffer *buffers, __be32 xid) -{ - struct rpcrdma_req *pos; - - list_for_each_entry(pos, &buffers->rb_pending, rl_list) - if (pos->rl_xid == xid) - return pos; - return NULL; -} - -static inline void -rpcrdma_remove_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) -{ - spin_lock(&buffers->rb_lock); - list_del(&req->rl_list); - spin_unlock(&buffers->rb_lock); -} - struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *); void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); @@ -638,10 +613,16 @@ enum rpcrdma_chunktype { bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *, u32, struct xdr_buf *, enum rpcrdma_chunktype); void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *); -int rpcrdma_marshal_req(struct rpc_rqst *); +int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); void rpcrdma_reply_handler(struct work_struct *work); +static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) +{ + xdr->head[0].iov_len = len; + xdr->len = len; +} + /* RPC/RDMA module init - xprtrdma/transport.c */ extern unsigned int xprt_rdma_max_inline_read; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 4f154d388748..9b5de31aa429 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -969,10 +969,12 @@ static void xs_local_data_read_skb(struct rpc_xprt *xprt, return; /* Look up and lock the request corresponding to the given XID */ - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; + xprt_pin_rqst(rovr); + spin_unlock(&xprt->recv_lock); task = rovr->rq_task; copied = rovr->rq_private_buf.buflen; @@ -981,13 +983,16 @@ static void xs_local_data_read_skb(struct rpc_xprt *xprt, if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) { dprintk("RPC: sk_buff copy failed\n"); - goto out_unlock; + spin_lock(&xprt->recv_lock); + goto out_unpin; } + spin_lock(&xprt->recv_lock); xprt_complete_rqst(task, copied); - +out_unpin: + xprt_unpin_rqst(rovr); out_unlock: - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); } static void xs_local_data_receive(struct sock_xprt *transport) @@ -1050,10 +1055,12 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, return; /* Look up and lock the request corresponding to the given XID */ - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; + xprt_pin_rqst(rovr); + spin_unlock(&xprt->recv_lock); task = rovr->rq_task; if ((copied = rovr->rq_private_buf.buflen) > repsize) @@ -1062,16 +1069,21 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, /* Suck it into the iovec, verify checksum if not done by hw. */ if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) { __UDPX_INC_STATS(sk, UDP_MIB_INERRORS); - goto out_unlock; + spin_lock(&xprt->recv_lock); + goto out_unpin; } __UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS); + spin_lock_bh(&xprt->transport_lock); xprt_adjust_cwnd(xprt, task, copied); + spin_unlock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); xprt_complete_rqst(task, copied); - +out_unpin: + xprt_unpin_rqst(rovr); out_unlock: - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); } static void xs_udp_data_receive(struct sock_xprt *transport) @@ -1277,25 +1289,12 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt, } len = desc->count; - if (len > transport->tcp_reclen - transport->tcp_offset) { - struct xdr_skb_reader my_desc; - - len = transport->tcp_reclen - transport->tcp_offset; - memcpy(&my_desc, desc, sizeof(my_desc)); - my_desc.count = len; - r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, - &my_desc, xdr_skb_read_bits); - desc->count -= r; - desc->offset += r; - } else - r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, + if (len > transport->tcp_reclen - transport->tcp_offset) + desc->count = transport->tcp_reclen - transport->tcp_offset; + r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, desc, xdr_skb_read_bits); - if (r > 0) { - transport->tcp_copied += r; - transport->tcp_offset += r; - } - if (r != len) { + if (desc->count) { /* Error when copying to the receive buffer, * usually because we weren't able to allocate * additional buffer pages. All we can do now @@ -1315,6 +1314,10 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt, return; } + transport->tcp_copied += r; + transport->tcp_offset += r; + desc->count = len - r; + dprintk("RPC: XID %08x read %zd bytes\n", ntohl(transport->tcp_xid), r); dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, " @@ -1343,21 +1346,24 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt, dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid)); /* Find and lock the request corresponding to this xid */ - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); req = xprt_lookup_rqst(xprt, transport->tcp_xid); if (!req) { dprintk("RPC: XID %08x request not found!\n", ntohl(transport->tcp_xid)); - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); return -1; } + xprt_pin_rqst(req); + spin_unlock(&xprt->recv_lock); xs_tcp_read_common(xprt, desc, req); + spin_lock(&xprt->recv_lock); if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) xprt_complete_rqst(req->rq_task, transport->tcp_copied); - - spin_unlock_bh(&xprt->transport_lock); + xprt_unpin_rqst(req); + spin_unlock(&xprt->recv_lock); return 0; } @@ -1376,11 +1382,9 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt, container_of(xprt, struct sock_xprt, xprt); struct rpc_rqst *req; - /* Look up and lock the request corresponding to the given XID */ - spin_lock_bh(&xprt->transport_lock); + /* Look up the request corresponding to the given XID */ req = xprt_lookup_bc_request(xprt, transport->tcp_xid); if (req == NULL) { - spin_unlock_bh(&xprt->transport_lock); printk(KERN_WARNING "Callback slot table overflowed\n"); xprt_force_disconnect(xprt); return -1; @@ -1391,7 +1395,6 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt, if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) xprt_complete_bc_request(req, transport->tcp_copied); - spin_unlock_bh(&xprt->transport_lock); return 0; } @@ -1516,6 +1519,7 @@ static void xs_tcp_data_receive(struct sock_xprt *transport) .arg.data = xprt, }; unsigned long total = 0; + int loop; int read = 0; mutex_lock(&transport->recv_mutex); @@ -1524,20 +1528,20 @@ static void xs_tcp_data_receive(struct sock_xprt *transport) goto out; /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ - for (;;) { + for (loop = 0; loop < 64; loop++) { lock_sock(sk); read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); if (read <= 0) { clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); release_sock(sk); - if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) - break; - } else { - release_sock(sk); - total += read; + break; } + release_sock(sk); + total += read; rd_desc.count = 65536; } + if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) + queue_work(xprtiod_workqueue, &transport->recv_worker); out: mutex_unlock(&transport->recv_mutex); trace_xs_tcp_data_ready(xprt, read, total); @@ -2724,7 +2728,7 @@ static void bc_destroy(struct rpc_xprt *xprt) module_put(THIS_MODULE); } -static struct rpc_xprt_ops xs_local_ops = { +static const struct rpc_xprt_ops xs_local_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, .alloc_slot = xprt_alloc_slot, @@ -2742,7 +2746,7 @@ static struct rpc_xprt_ops xs_local_ops = { .disable_swap = xs_disable_swap, }; -static struct rpc_xprt_ops xs_udp_ops = { +static const struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, @@ -2764,7 +2768,7 @@ static struct rpc_xprt_ops xs_udp_ops = { .inject_disconnect = xs_inject_disconnect, }; -static struct rpc_xprt_ops xs_tcp_ops = { +static const struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, .alloc_slot = xprt_lock_and_alloc_slot, @@ -2795,7 +2799,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { * The rpc_xprt_ops for the server backchannel */ -static struct rpc_xprt_ops bc_tcp_ops = { +static const struct rpc_xprt_ops bc_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, .alloc_slot = xprt_alloc_slot, diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index ac1d66d7e1fd..47ec121574ce 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -637,7 +637,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, break; case NETDEV_UNREGISTER: case NETDEV_CHANGENAME: - bearer_disable(dev_net(dev), b); + bearer_disable(net, b); break; } return NOTIFY_OK; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8ce85420ecb0..0df8023f480b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3791,8 +3791,8 @@ static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params) { const struct cfg80211_beacon_data *bcn = ¶ms->beacon; - size_t ies_len = bcn->beacon_ies_len; - const u8 *ies = bcn->beacon_ies; + size_t ies_len = bcn->tail_len; + const u8 *ies = bcn->tail; const u8 *rates; const u8 *cap; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 5fae296a6a58..6e94f6934a0e 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -4,6 +4,7 @@ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2008-2011 Luis R. Rodriguez <mcgrof@qca.qualcomm.com> * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2017 Intel Deutschland GmbH * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -1483,7 +1484,9 @@ static void reg_process_ht_flags_channel(struct wiphy *wiphy, { struct ieee80211_supported_band *sband = wiphy->bands[channel->band]; struct ieee80211_channel *channel_before = NULL, *channel_after = NULL; + const struct ieee80211_regdomain *regd; unsigned int i; + u32 flags; if (!is_ht40_allowed(channel)) { channel->flags |= IEEE80211_CHAN_NO_HT40; @@ -1503,17 +1506,30 @@ static void reg_process_ht_flags_channel(struct wiphy *wiphy, channel_after = c; } + flags = 0; + regd = get_wiphy_regdom(wiphy); + if (regd) { + const struct ieee80211_reg_rule *reg_rule = + freq_reg_info_regd(MHZ_TO_KHZ(channel->center_freq), + regd, MHZ_TO_KHZ(20)); + + if (!IS_ERR(reg_rule)) + flags = reg_rule->flags; + } + /* * Please note that this assumes target bandwidth is 20 MHz, * if that ever changes we also need to change the below logic * to include that as well. */ - if (!is_ht40_allowed(channel_before)) + if (!is_ht40_allowed(channel_before) || + flags & NL80211_RRF_NO_HT40MINUS) channel->flags |= IEEE80211_CHAN_NO_HT40MINUS; else channel->flags &= ~IEEE80211_CHAN_NO_HT40MINUS; - if (!is_ht40_allowed(channel_after)) + if (!is_ht40_allowed(channel_after) || + flags & NL80211_RRF_NO_HT40PLUS) channel->flags |= IEEE80211_CHAN_NO_HT40PLUS; else channel->flags &= ~IEEE80211_CHAN_NO_HT40PLUS; |