From 9802d21e7a0b0d2167ef745edc1f4ea7a0fc6ea3 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 12 Jun 2014 09:17:55 +0300 Subject: ipvs: stop tot_stats estimator only under CONFIG_SYSCTL The tot_stats estimator is started only when CONFIG_SYSCTL is defined. But it is stopped without checking CONFIG_SYSCTL. Fix the crash by moving ip_vs_stop_estimator into ip_vs_control_net_cleanup_sysctl. The change is needed after commit 14e405461e664b ("IPVS: Add __ip_vs_control_{init,cleanup}_sysctl()") from 2.6.39. Reported-by: Jet Chen Tested-by: Jet Chen Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_ctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c42e83d2751c..581a6584ed0c 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3778,6 +3778,7 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); + ip_vs_stop_estimator(net, &ipvs->tot_stats); } #else @@ -3840,7 +3841,6 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_trash_cleanup(net); - ip_vs_stop_estimator(net, &ipvs->tot_stats); ip_vs_control_net_cleanup_sysctl(net); remove_proc_entry("ip_vs_stats_percpu", net->proc_net); remove_proc_entry("ip_vs_stats", net->proc_net); -- cgit v1.2.1 From b62b65055bcc5372d5c3f4103629176cb8db3678 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 5 Jun 2014 12:19:54 +0300 Subject: Bluetooth: Fix incorrectly overriding conn->src_type The src_type member of struct hci_conn should always reflect the address type of the src_member. It should never be overridden. There is already code in place in the command status handler of HCI_LE_Create_Connection to copy the right initiator address into conn->init_addr_type. Without this patch, if privacy is enabled, we will send the wrong address type in the SMP identity address information PDU (it'll e.g. contain our public address but a random address type). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org --- net/bluetooth/hci_conn.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 8671bc79a35b..b9b2bd464bec 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -610,11 +610,6 @@ static void hci_req_add_le_create_conn(struct hci_request *req, if (hci_update_random_address(req, false, &own_addr_type)) return; - /* Save the address type used for this connnection attempt so we able - * to retrieve this information if we need it. - */ - conn->src_type = own_addr_type; - cp.scan_interval = cpu_to_le16(hdev->le_scan_interval); cp.scan_window = cpu_to_le16(hdev->le_scan_window); bacpy(&cp.peer_addr, &conn->dst); -- cgit v1.2.1 From e694788d73efe139b24f78b036deb97fe57fa8cb Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 10 Jun 2014 09:54:24 +0300 Subject: Bluetooth: Fix check for connection encryption The conn->link_key variable tracks the type of link key in use. It is set whenever we respond to a link key request as well as when we get a link key notification event. These two events do not however always guarantee that encryption is enabled: getting a link key request and responding to it may only mean that the remote side has requested authentication but not encryption. On the other hand, the encrypt change event is a certain guarantee that encryption is enabled. The real encryption state is already tracked in the conn->link_mode variable through the HCI_LM_ENCRYPT bit. This patch fixes a check for encryption in the hci_conn_auth function to use the proper conn->link_mode value and thereby eliminates the chance of a false positive result. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org --- net/bluetooth/hci_conn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index b9b2bd464bec..ca01d1861854 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -889,7 +889,7 @@ static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) /* If we're already encrypted set the REAUTH_PEND flag, * otherwise set the ENCRYPT_PEND. */ - if (conn->key_type != 0xff) + if (conn->link_mode & HCI_LM_ENCRYPT) set_bit(HCI_CONN_REAUTH_PEND, &conn->flags); else set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); -- cgit v1.2.1 From ba15a58b179ed76a7e887177f2b06de12c58ec8f Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 9 Jun 2014 13:58:14 +0300 Subject: Bluetooth: Fix SSP acceptor just-works confirmation without MITM From the Bluetooth Core Specification 4.1 page 1958: "if both devices have set the Authentication_Requirements parameter to one of the MITM Protection Not Required options, authentication stage 1 shall function as if both devices set their IO capabilities to DisplayOnly (e.g., Numeric comparison with automatic confirmation on both devices)" So far our implementation has done user confirmation for all just-works cases regardless of the MITM requirements, however following the specification to the word means that we should not be doing confirmation when neither side has the MITM flag set. Signed-off-by: Johan Hedberg Tested-by: Szymon Janc Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org --- net/bluetooth/hci_event.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 21e5913d12e0..ff11f4a1ada3 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3628,8 +3628,11 @@ static void hci_user_confirm_request_evt(struct hci_dev *hdev, /* If we're not the initiators request authorization to * proceed from user space (mgmt_user_confirm with - * confirm_hint set to 1). */ - if (!test_bit(HCI_CONN_AUTH_PEND, &conn->flags)) { + * confirm_hint set to 1). The exception is if neither + * side had MITM in which case we do auto-accept. + */ + if (!test_bit(HCI_CONN_AUTH_PEND, &conn->flags) && + (loc_mitm || rem_mitm)) { BT_DBG("Confirming auto-accept as acceptor"); confirm_hint = 1; goto confirm; -- cgit v1.2.1 From 4ad51a75c70ba1ba6802fa7ff2ee6829b1c6e61a Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 9 Jun 2014 14:41:25 +0300 Subject: Bluetooth: Add clarifying comment for conn->auth_type When responding to an IO capability request when we're the initiators of the pairing we will not yet have the remote IO capability information. Since the conn->auth_type variable is treated as an "absolute" requirement instead of a hint of what's needed later in the user confirmation request handler it's important that it doesn't have the MITM bit set if there's any chance that the remote device doesn't have the necessary IO capabilities. This patch adds a clarifying comment so that conn->auth_type is left untouched in this scenario. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index ff11f4a1ada3..3183edc25769 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3537,7 +3537,11 @@ static void hci_io_capa_request_evt(struct hci_dev *hdev, struct sk_buff *skb) cp.authentication = conn->auth_type; /* Request MITM protection if our IO caps allow it - * except for the no-bonding case + * except for the no-bonding case. + * conn->auth_type is not updated here since + * that might cause the user confirmation to be + * rejected in case the remote doesn't have the + * IO capabilities for MITM. */ if (conn->io_capability != HCI_IO_NO_INPUT_OUTPUT && cp.authentication != HCI_AT_NO_BONDING) -- cgit v1.2.1 From fff3490f47810e2d34b91fb9e31103e923b11c2f Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 10 Jun 2014 15:19:50 +0300 Subject: Bluetooth: Fix setting correct authentication information for SMP STK When we store the STK in slave role we should set the correct authentication information for it. If the pairing is producing a HIGH security level the STK is considered authenticated, and otherwise it's considered unauthenticated. This patch fixes the value passed to the hci_add_ltk() function when adding the STK on the slave side. Signed-off-by: Johan Hedberg Tested-by: Marcin Kraglak Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org --- net/bluetooth/smp.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 3d1cc164557d..f2829a7932e2 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -544,7 +544,7 @@ static u8 smp_random(struct smp_chan *smp) hci_le_start_enc(hcon, ediv, rand, stk); hcon->enc_key_size = smp->enc_key_size; } else { - u8 stk[16]; + u8 stk[16], auth; __le64 rand = 0; __le16 ediv = 0; @@ -556,8 +556,13 @@ static u8 smp_random(struct smp_chan *smp) memset(stk + smp->enc_key_size, 0, SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size); + if (hcon->pending_sec_level == BT_SECURITY_HIGH) + auth = 1; + else + auth = 0; + hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, - HCI_SMP_STK_SLAVE, 0, stk, smp->enc_key_size, + HCI_SMP_STK_SLAVE, auth, stk, smp->enc_key_size, ediv, rand); } -- cgit v1.2.1 From 50143a433b70e3145bcf8a4a4e54f0c11bdee32b Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 10 Jun 2014 14:05:57 +0300 Subject: Bluetooth: Fix indicating discovery state when canceling inquiry When inquiry is canceled through the HCI_Cancel_Inquiry command there is no Inquiry Complete event generated. Instead, all we get is the command complete for the HCI_Inquiry_Cancel command. This means that we must call the hci_discovery_set_state() function from the respective command complete handler in order to ensure that user space knows the correct discovery state. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org --- net/bluetooth/hci_event.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 3183edc25769..640c54ec1bd2 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -48,6 +48,10 @@ static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb) smp_mb__after_atomic(); /* wake_up_bit advises about this barrier */ wake_up_bit(&hdev->flags, HCI_INQUIRY); + hci_dev_lock(hdev); + hci_discovery_set_state(hdev, DISCOVERY_STOPPED); + hci_dev_unlock(hdev); + hci_conn_check_pending(hdev); } -- cgit v1.2.1 From 21a60d307ddc2180cfa542a995d943d1034cf5c5 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 10 Jun 2014 14:05:58 +0300 Subject: Bluetooth: Refactor discovery stopping into its own function We'll need to reuse the same logic for stopping discovery also when cleaning up HCI state when powering off. This patch refactors the code out to its own function that can later (in a subsequent patch) be used also for the power off case. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org --- net/bluetooth/mgmt.c | 93 +++++++++++++++++++++++++++------------------------- 1 file changed, 49 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 0fce54412ffd..be6f03219121 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1047,6 +1047,43 @@ static void clean_up_hci_complete(struct hci_dev *hdev, u8 status) } } +static void hci_stop_discovery(struct hci_request *req) +{ + struct hci_dev *hdev = req->hdev; + struct hci_cp_remote_name_req_cancel cp; + struct inquiry_entry *e; + + switch (hdev->discovery.state) { + case DISCOVERY_FINDING: + if (test_bit(HCI_INQUIRY, &hdev->flags)) { + hci_req_add(req, HCI_OP_INQUIRY_CANCEL, 0, NULL); + } else { + cancel_delayed_work(&hdev->le_scan_disable); + hci_req_add_le_scan_disable(req); + } + + break; + + case DISCOVERY_RESOLVING: + e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, + NAME_PENDING); + if (!e) + return; + + bacpy(&cp.bdaddr, &e->data.bdaddr); + hci_req_add(req, HCI_OP_REMOTE_NAME_REQ_CANCEL, sizeof(cp), + &cp); + + break; + + default: + /* Passive scanning */ + if (test_bit(HCI_LE_SCAN, &hdev->dev_flags)) + hci_req_add_le_scan_disable(req); + break; + } +} + static int clean_up_hci_state(struct hci_dev *hdev) { struct hci_request req; @@ -3574,8 +3611,6 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_stop_discovery *mgmt_cp = data; struct pending_cmd *cmd; - struct hci_cp_remote_name_req_cancel cp; - struct inquiry_entry *e; struct hci_request req; int err; @@ -3605,52 +3640,22 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, hci_req_init(&req, hdev); - switch (hdev->discovery.state) { - case DISCOVERY_FINDING: - if (test_bit(HCI_INQUIRY, &hdev->flags)) { - hci_req_add(&req, HCI_OP_INQUIRY_CANCEL, 0, NULL); - } else { - cancel_delayed_work(&hdev->le_scan_disable); - - hci_req_add_le_scan_disable(&req); - } - - break; - - case DISCOVERY_RESOLVING: - e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, - NAME_PENDING); - if (!e) { - mgmt_pending_remove(cmd); - err = cmd_complete(sk, hdev->id, - MGMT_OP_STOP_DISCOVERY, 0, - &mgmt_cp->type, - sizeof(mgmt_cp->type)); - hci_discovery_set_state(hdev, DISCOVERY_STOPPED); - goto unlock; - } - - bacpy(&cp.bdaddr, &e->data.bdaddr); - hci_req_add(&req, HCI_OP_REMOTE_NAME_REQ_CANCEL, sizeof(cp), - &cp); + hci_stop_discovery(&req); - break; - - default: - BT_DBG("unknown discovery state %u", hdev->discovery.state); - - mgmt_pending_remove(cmd); - err = cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY, - MGMT_STATUS_FAILED, &mgmt_cp->type, - sizeof(mgmt_cp->type)); + err = hci_req_run(&req, stop_discovery_complete); + if (!err) { + hci_discovery_set_state(hdev, DISCOVERY_STOPPING); goto unlock; } - err = hci_req_run(&req, stop_discovery_complete); - if (err < 0) - mgmt_pending_remove(cmd); - else - hci_discovery_set_state(hdev, DISCOVERY_STOPPING); + mgmt_pending_remove(cmd); + + /* If no HCI commands were sent we're done */ + if (err == -ENODATA) { + err = cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY, 0, + &mgmt_cp->type, sizeof(mgmt_cp->type)); + hci_discovery_set_state(hdev, DISCOVERY_STOPPED); + } unlock: hci_dev_unlock(hdev); -- cgit v1.2.1 From f8680f128b01212895a9afb31032f6ffe91bd771 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 10 Jun 2014 14:05:59 +0300 Subject: Bluetooth: Reuse hci_stop_discovery function when cleaning up HCI state When cleaning up the HCI state as part of the power-off procedure we can reuse the hci_stop_discovery() function instead of explicitly sending HCI command related to discovery. The added benefit of this is that it takes care of canceling name resolving and inquiry which were not previously covered by the code. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org --- net/bluetooth/mgmt.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index be6f03219121..6107e037cd8e 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1100,9 +1100,7 @@ static int clean_up_hci_state(struct hci_dev *hdev) if (test_bit(HCI_ADVERTISING, &hdev->dev_flags)) disable_advertising(&req); - if (test_bit(HCI_LE_SCAN, &hdev->dev_flags)) { - hci_req_add_le_scan_disable(&req); - } + hci_stop_discovery(&req); list_for_each_entry(conn, &hdev->conn_hash.list, list) { struct hci_cp_disconnect dc; -- cgit v1.2.1 From 7ab56c3a6eccb215034b0cb096e0313441cbf2a4 Mon Sep 17 00:00:00 2001 From: Jukka Taimisto Date: Thu, 12 Jun 2014 10:15:13 +0000 Subject: Bluetooth: Fix deadlock in l2cap_conn_del() A deadlock occurs when PDU containing invalid SMP opcode is received on Security Manager Channel over LE link and conn->pending_rx_work worker has not run yet. When LE link is created l2cap_conn_ready() is called and before returning it schedules conn->pending_rx_work worker to hdev->workqueue. Incoming data to SMP fixed channel is handled by l2cap_recv_frame() which calls smp_sig_channel() to handle the SMP PDU. If smp_sig_channel() indicates failure l2cap_conn_del() is called to delete the connection. When deleting the connection, l2cap_conn_del() purges the pending_rx queue and calls flush_work() to wait for the pending_rx_work worker to complete. Since incoming data is handled by a worker running from the same workqueue as the pending_rx_work is being scheduled on, we will deadlock on waiting for pending_rx_work to complete. This patch fixes the deadlock by calling cancel_work_sync() instead of flush_work(). Signed-off-by: Jukka Taimisto Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org --- net/bluetooth/l2cap_core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 6eabbe05fe54..323f23cd2c37 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1663,7 +1663,13 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) kfree_skb(conn->rx_skb); skb_queue_purge(&conn->pending_rx); - flush_work(&conn->pending_rx_work); + + /* We can not call flush_work(&conn->pending_rx_work) here since we + * might block if we are running on a worker from the same workqueue + * pending_rx_work is waiting on. + */ + if (work_pending(&conn->pending_rx_work)) + cancel_work_sync(&conn->pending_rx_work); l2cap_unregister_all_users(conn); -- cgit v1.2.1 From c73f94b8c093a615ce80eabbde0ac6eb9abfe31a Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 13 Jun 2014 10:22:28 +0300 Subject: Bluetooth: Fix locking of hdev when calling into SMP code The SMP code expects hdev to be unlocked since e.g. crypto functions will try to (re)lock it. Therefore, we need to release the lock before calling into smp.c from mgmt.c. Without this we risk a deadlock whenever the smp_user_confirm_reply() function is called. Signed-off-by: Johan Hedberg Tested-by: Lukasz Rymanowski Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org --- net/bluetooth/mgmt.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 6107e037cd8e..af8e0a6243b7 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3031,8 +3031,13 @@ static int user_pairing_resp(struct sock *sk, struct hci_dev *hdev, } if (addr->type == BDADDR_LE_PUBLIC || addr->type == BDADDR_LE_RANDOM) { - /* Continue with pairing via SMP */ + /* Continue with pairing via SMP. The hdev lock must be + * released as SMP may try to recquire it for crypto + * purposes. + */ + hci_dev_unlock(hdev); err = smp_user_confirm_reply(conn, mgmt_op, passkey); + hci_dev_lock(hdev); if (!err) err = cmd_complete(sk, hdev->id, mgmt_op, -- cgit v1.2.1 From 92d1372e1a9fec00e146b74e8b9ad7a385b9b37f Mon Sep 17 00:00:00 2001 From: Marcin Kraglak Date: Fri, 13 Jun 2014 14:08:22 +0200 Subject: Bluetooth: Allow change security level on ATT_CID in slave role Kernel supports SMP Security Request so don't block increasing security when we are slave. Signed-off-by: Marcin Kraglak Acked-by: Johan Hedberg Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org --- net/bluetooth/l2cap_sock.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index ade3fb4c23bc..e1378693cc90 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -787,11 +787,6 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, /*change security for LE channels */ if (chan->scid == L2CAP_CID_ATT) { - if (!conn->hcon->out) { - err = -EINVAL; - break; - } - if (smp_conn_security(conn->hcon, sec.level)) break; sk->sk_state = BT_CONFIG; -- cgit v1.2.1 From 266155b2de8fb721ae353688529b2f8bcdde2f90 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 5 Jun 2014 14:28:44 +0200 Subject: netfilter: ctnetlink: fix dumping of dying/unconfirmed conntracks The dumping prematurely stops, it seems the callback argument that indicates that all entries have been dumped is set after iterating on the first cpu list. The dumping also may stop before the entire per-cpu list content is also dumped. With this patch, conntrack -L dying now shows the dying list content again. Fixes: b7779d06 ("netfilter: conntrack: spinlock per cpu to protect special lists.") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 58579634427d..ef0eedd70541 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1163,9 +1163,6 @@ ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying if (cb->args[2]) return 0; - if (cb->args[0] == nr_cpu_ids) - return 0; - for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { struct ct_pcpu *pcpu; @@ -1194,6 +1191,7 @@ restart: rcu_read_unlock(); if (res < 0) { nf_conntrack_get(&ct->ct_general); + cb->args[0] = cpu; cb->args[1] = (unsigned long)ct; spin_unlock_bh(&pcpu->lock); goto out; @@ -1202,10 +1200,10 @@ restart: if (cb->args[1]) { cb->args[1] = 0; goto restart; - } else - cb->args[2] = 1; + } spin_unlock_bh(&pcpu->lock); } + cb->args[2] = 1; out: if (last) nf_ct_put(last); -- cgit v1.2.1 From cd5f336f1780cb20e83146cde64d3d5779e175e6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 8 Jun 2014 11:41:23 +0200 Subject: netfilter: ctnetlink: fix refcnt leak in dying/unconfirmed list dumper 'last' keeps track of the ct that had its refcnt bumped during previous dump cycle. Thus it must not be overwritten until end-of-function. Another (unrelated, theoretical) issue: Don't attempt to bump refcnt of a conntrack whose reference count is already 0. Such conntrack is being destroyed right now, its memory is freed once we release the percpu dying spinlock. Fixes: b7779d06 ('netfilter: conntrack: spinlock per cpu to protect special lists.') Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index ef0eedd70541..70123f48b6c9 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1150,7 +1150,7 @@ static int ctnetlink_done_list(struct netlink_callback *cb) static int ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying) { - struct nf_conn *ct, *last = NULL; + struct nf_conn *ct, *last; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); @@ -1163,6 +1163,8 @@ ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying if (cb->args[2]) return 0; + last = (struct nf_conn *)cb->args[1]; + for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { struct ct_pcpu *pcpu; @@ -1171,7 +1173,6 @@ ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); spin_lock_bh(&pcpu->lock); - last = (struct nf_conn *)cb->args[1]; list = dying ? &pcpu->dying : &pcpu->unconfirmed; restart: hlist_nulls_for_each_entry(h, n, list, hnnode) { @@ -1190,7 +1191,8 @@ restart: ct); rcu_read_unlock(); if (res < 0) { - nf_conntrack_get(&ct->ct_general); + if (!atomic_inc_not_zero(&ct->ct_general.use)) + continue; cb->args[0] = cpu; cb->args[1] = (unsigned long)ct; spin_unlock_bh(&pcpu->lock); -- cgit v1.2.1 From 5bc5c307653cbf8fe9da6cbd8ae6c6bd5b86ff4b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 10 Jun 2014 10:53:00 +0200 Subject: netfilter: nf_tables: use RCU-safe list insertion when replacing rules The patch 5e94846 ("netfilter: nf_tables: add insert operation") did not include RCU-safe list insertion when replacing rules. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 624e083125b9..ba37c10e5139 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1796,7 +1796,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, goto err2; } nft_rule_disactivate_next(net, old_rule); - list_add_tail(&rule->list, &old_rule->list); + list_add_tail_rcu(&rule->list, &old_rule->list); } else { err = -ENOENT; goto err2; -- cgit v1.2.1 From a0a7379e16b6e4c229d082f24c7e3ef9e812ed46 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 10 Jun 2014 10:53:01 +0200 Subject: netfilter: nf_tables: use u32 for chain use counter Since 4fefee5 ("netfilter: nf_tables: allow to delete several objects from a batch"), every new rule bumps the chain use counter. However, this is limited to 16 bits, which means that it will overrun after 2^16 rules. Use a u32 chain counter and check for overflows (just like we do for table objects). Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ba37c10e5139..5586426a6169 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1730,6 +1730,9 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, if (!create || nlh->nlmsg_flags & NLM_F_REPLACE) return -EINVAL; handle = nf_tables_alloc_handle(table); + + if (chain->use == UINT_MAX) + return -EOVERFLOW; } if (nla[NFTA_RULE_POSITION]) { -- cgit v1.2.1 From ac34b861979ec5057d686c890b1b8f8661e9b99f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 10 Jun 2014 10:53:02 +0200 Subject: netfilter: nf_tables: decrement chain use counter when replacing rules Thus, the chain use counter remains with the same value after the rule replacement. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5586426a6169..19f438deeab8 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1799,6 +1799,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, goto err2; } nft_rule_disactivate_next(net, old_rule); + chain->use--; list_add_tail_rcu(&rule->list, &old_rule->list); } else { err = -ENOENT; @@ -1829,6 +1830,7 @@ err3: list_del_rcu(&nft_trans_rule(trans)->list); nft_rule_clear(net, nft_trans_rule(trans)); nft_trans_destroy(trans); + chain->use++; } err2: nf_tables_rule_destroy(&ctx, rule); -- cgit v1.2.1 From ac904ac835ac7879a9374dc3ef1e5cb75d9c7ceb Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 10 Jun 2014 10:53:03 +0200 Subject: netfilter: nf_tables: fix wrong type in transaction when replacing rules In b380e5c ("netfilter: nf_tables: add message type to transactions"), I used the wrong message type in the rule replacement case. The rule that is replaced needs to be handled as a deleted rule. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 19f438deeab8..39369ea2df0c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1792,7 +1792,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_REPLACE) { if (nft_rule_is_active_next(net, old_rule)) { - trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, + trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE, old_rule); if (trans == NULL) { err = -ENOMEM; -- cgit v1.2.1 From 3d9b142131ef0cde259dbac5cce36f570fcb4902 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 11 Jun 2014 14:27:46 +0200 Subject: netfilter: nft_compat: call {target, match}->destroy() to cleanup entry Otherwise, the reference to external objects (eg. modules) are not released when the rules are removed. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 8a779be832fb..1840989092ed 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -195,6 +195,15 @@ static void nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct xt_target *target = expr->ops->data; + void *info = nft_expr_priv(expr); + struct xt_tgdtor_param par; + + par.net = ctx->net; + par.target = target; + par.targinfo = info; + par.family = ctx->afi->family; + if (par.target->destroy != NULL) + par.target->destroy(&par); module_put(target->me); } @@ -382,6 +391,15 @@ static void nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct xt_match *match = expr->ops->data; + void *info = nft_expr_priv(expr); + struct xt_mtdtor_param par; + + par.net = ctx->net; + par.match = match; + par.matchinfo = info; + par.family = ctx->afi->family; + if (par.match->destroy != NULL) + par.match->destroy(&par); module_put(match->me); } -- cgit v1.2.1 From 6403d96254c7c44fdfa163248b1198c714c65f6a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 11 Jun 2014 19:05:28 +0200 Subject: netfilter: nf_tables: indicate family when dumping set elements Set the nfnetlink header that indicates the family of this element. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 39369ea2df0c..ab4566cfcbe4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2850,7 +2850,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) goto nla_put_failure; nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = NFPROTO_UNSPEC; + nfmsg->nfgen_family = ctx.afi->family; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; -- cgit v1.2.1 From 915136065b7ca75af4cae06281e4dc43926edbfe Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 13 Jun 2014 13:45:38 +0200 Subject: netfilter: nft_nat: don't dump port information if unset Don't include port information attributes if they are unset. Reported-by: Ana Rey Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_nat.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index a0195d28bcfc..79ff58cd36dc 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -175,12 +175,14 @@ static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_be32(skb, NFTA_NAT_REG_ADDR_MAX, htonl(priv->sreg_addr_max))) goto nla_put_failure; - if (nla_put_be32(skb, - NFTA_NAT_REG_PROTO_MIN, htonl(priv->sreg_proto_min))) - goto nla_put_failure; - if (nla_put_be32(skb, - NFTA_NAT_REG_PROTO_MAX, htonl(priv->sreg_proto_max))) - goto nla_put_failure; + if (priv->sreg_proto_min) { + if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MIN, + htonl(priv->sreg_proto_min))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MAX, + htonl(priv->sreg_proto_max))) + goto nla_put_failure; + } return 0; nla_put_failure: -- cgit v1.2.1 From 4a001068d790366bbf64ee927a363f752abafa71 Mon Sep 17 00:00:00 2001 From: Ken-ichirou MATSUZAWA Date: Mon, 16 Jun 2014 13:52:34 +0200 Subject: netfilter: ctnetlink: add zone size to length Signed-off-by: Ken-ichirou MATSUZAWA Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 70123f48b6c9..300ed1eec729 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -596,6 +596,9 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) #endif #ifdef CONFIG_NF_CONNTRACK_MARK + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ +#endif +#ifdef CONFIG_NF_CONNTRACK_ZONES + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ #endif + ctnetlink_proto_size(ct) + ctnetlink_label_size(ct) @@ -2039,6 +2042,9 @@ ctnetlink_nfqueue_build_size(const struct nf_conn *ct) #endif #ifdef CONFIG_NF_CONNTRACK_MARK + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ +#endif +#ifdef CONFIG_NF_CONNTRACK_ZONES + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ #endif + ctnetlink_proto_size(ct) ; -- cgit v1.2.1 From 945b2b2d259d1a4364a2799e80e8ff32f8c6ee6f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 7 Jun 2014 21:17:04 +0200 Subject: netfilter: nf_nat: fix oops on netns removal Quoting Samu Kallio: Basically what's happening is, during netns cleanup, nf_nat_net_exit gets called before ipv4_net_exit. As I understand it, nf_nat_net_exit is supposed to kill any conntrack entries which have NAT context (through nf_ct_iterate_cleanup), but for some reason this doesn't happen (perhaps something else is still holding refs to those entries?). When ipv4_net_exit is called, conntrack entries (including those with NAT context) are cleaned up, but the nat_bysource hashtable is long gone - freed in nf_nat_net_exit. The bug happens when attempting to free a conntrack entry whose NAT hash 'prev' field points to a slot in the freed hash table (head for that bin). We ignore conntracks with null nat bindings. But this is wrong, as these are in bysource hash table as well. Restore nat-cleaning for the netns-is-being-removed case. bug: https://bugzilla.kernel.org/show_bug.cgi?id=65191 Fixes: c2d421e1718 ('netfilter: nf_nat: fix race when unloading protocol modules') Reported-by: Samu Kallio Debugged-by: Samu Kallio Signed-off-by: Florian Westphal Tested-by: Samu Kallio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 09096a670c45..a49907b1dabc 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -525,6 +525,39 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data) return i->status & IPS_NAT_MASK ? 1 : 0; } +static int nf_nat_proto_clean(struct nf_conn *ct, void *data) +{ + struct nf_conn_nat *nat = nfct_nat(ct); + + if (nf_nat_proto_remove(ct, data)) + return 1; + + if (!nat || !nat->ct) + return 0; + + /* This netns is being destroyed, and conntrack has nat null binding. + * Remove it from bysource hash, as the table will be freed soon. + * + * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() + * will delete entry from already-freed table. + */ + if (!del_timer(&ct->timeout)) + return 1; + + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&nat->bysource); + ct->status &= ~IPS_NAT_DONE_MASK; + nat->ct = NULL; + spin_unlock_bh(&nf_nat_lock); + + add_timer(&ct->timeout); + + /* don't delete conntrack. Although that would make things a lot + * simpler, we'd end up flushing all conntracks on nat rmmod. + */ + return 0; +} + static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto) { struct nf_nat_proto_clean clean = { @@ -795,7 +828,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) { struct nf_nat_proto_clean clean = {}; - nf_ct_iterate_cleanup(net, &nf_nat_proto_remove, &clean, 0, 0); + nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0); synchronize_rcu(); nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size); } -- cgit v1.2.1 From 17846376f21c07c1b9ddfdef1a01bf3828fc1e06 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 16 Jun 2014 16:30:36 -0400 Subject: tcp: remove unnecessary tcp_sk assignment. This variable is overwritten by the child socket assignment before it ever gets used. Signed-off-by: Dave Jones Signed-off-by: David S. Miller --- net/ipv4/tcp_fastopen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 62e48cf84e60..9771563ab564 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -131,7 +131,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, struct dst_entry *dst, struct request_sock *req) { - struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sock *tp; struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; struct sock *child; -- cgit v1.2.1 From d36a4f4b472334562b8e7252e35d3d770db83815 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Tue, 17 Jun 2014 22:32:42 +0800 Subject: net: return actual error on register_queue_kobjects Return the actual error code if call kset_create_and_add() failed Cc: David S. Miller Signed-off-by: Jie Liu Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 1cac29ebb05b..5c1c1e526a2b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1200,8 +1200,8 @@ static int register_queue_kobjects(struct net_device *net) #ifdef CONFIG_SYSFS net->queues_kset = kset_create_and_add("queues", NULL, &net->dev.kobj); - if (!net->queues_kset) - return -ENOMEM; + if (IS_ERR(net->queues_kset)) + return PTR_ERR(net->queues_kset); real_rx = net->real_num_rx_queues; #endif real_tx = net->real_num_tx_queues; -- cgit v1.2.1 From ff5e92c1affe7166b3f6e7073e648ed65a6e2e59 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 19 Jun 2014 01:31:30 +0200 Subject: net: sctp: propagate sysctl errors from proc_do* properly sysctl handler proc_sctp_do_hmac_alg(), proc_sctp_do_rto_min() and proc_sctp_do_rto_max() do not properly reflect some error cases when writing values via sysctl from internal proc functions such as proc_dointvec() and proc_dostring(). In all these cases we pass the test for write != 0 and partially do additional work just to notice that additional sanity checks fail and we return with hard-coded -EINVAL while proc_do* functions might also return different errors. So fix this up by simply testing a successful return of proc_do* right after calling it. This also allows to propagate its return value onwards to the user. While touching this, also fix up some minor style issues. Fixes: 4f3fdf3bc59c ("sctp: add check rto_min and rto_max in sysctl") Fixes: 3c68198e7511 ("sctp: Make hmac algorithm selection for cookie generation dynamic") Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/sctp/sysctl.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index dcb19592761e..cc12162ba091 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -321,41 +321,40 @@ static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; - char tmp[8]; struct ctl_table tbl; - int ret; - int changed = 0; + bool changed = false; char *none = "none"; + char tmp[8]; + int ret; memset(&tbl, 0, sizeof(struct ctl_table)); if (write) { tbl.data = tmp; - tbl.maxlen = 8; + tbl.maxlen = sizeof(tmp); } else { tbl.data = net->sctp.sctp_hmac_alg ? : none; tbl.maxlen = strlen(tbl.data); } - ret = proc_dostring(&tbl, write, buffer, lenp, ppos); - if (write) { + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) { #ifdef CONFIG_CRYPTO_MD5 if (!strncmp(tmp, "md5", 3)) { net->sctp.sctp_hmac_alg = "md5"; - changed = 1; + changed = true; } #endif #ifdef CONFIG_CRYPTO_SHA1 if (!strncmp(tmp, "sha1", 4)) { net->sctp.sctp_hmac_alg = "sha1"; - changed = 1; + changed = true; } #endif if (!strncmp(tmp, "none", 4)) { net->sctp.sctp_hmac_alg = NULL; - changed = 1; + changed = true; } - if (!changed) ret = -EINVAL; } @@ -368,11 +367,10 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; - int new_value; - struct ctl_table tbl; unsigned int min = *(unsigned int *) ctl->extra1; unsigned int max = *(unsigned int *) ctl->extra2; - int ret; + struct ctl_table tbl; + int ret, new_value; memset(&tbl, 0, sizeof(struct ctl_table)); tbl.maxlen = sizeof(unsigned int); @@ -381,12 +379,15 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, tbl.data = &new_value; else tbl.data = &net->sctp.rto_min; + ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); - if (write) { - if (ret || new_value > max || new_value < min) + if (write && ret == 0) { + if (new_value > max || new_value < min) return -EINVAL; + net->sctp.rto_min = new_value; } + return ret; } @@ -395,11 +396,10 @@ static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; - int new_value; - struct ctl_table tbl; unsigned int min = *(unsigned int *) ctl->extra1; unsigned int max = *(unsigned int *) ctl->extra2; - int ret; + struct ctl_table tbl; + int ret, new_value; memset(&tbl, 0, sizeof(struct ctl_table)); tbl.maxlen = sizeof(unsigned int); @@ -408,12 +408,15 @@ static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, tbl.data = &new_value; else tbl.data = &net->sctp.rto_max; + ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); - if (write) { - if (ret || new_value > max || new_value < min) + if (write && ret == 0) { + if (new_value > max || new_value < min) return -EINVAL; + net->sctp.rto_max = new_value; } + return ret; } -- cgit v1.2.1 From 6f9a093b66ce7cacc110d8737c03686e80ecfda6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 18 Jun 2014 15:34:57 -0700 Subject: net: filter: fix upper BPF instruction limit The original checks (via sk_chk_filter) for instruction count uses ">", not ">=", so changing this in sk_convert_filter has the potential to break existing seccomp filters that used exactly BPF_MAXINSNS many instructions. Fixes: bd4cf0ed331a ("net: filter: rework/optimize internal BPF interpreter's instruction set") Signed-off-by: Kees Cook Cc: stable@vger.kernel.org # v3.15+ Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 735fad897496..a44e12cdde4c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -840,7 +840,7 @@ int sk_convert_filter(struct sock_filter *prog, int len, BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); - if (len <= 0 || len >= BPF_MAXINSNS) + if (len <= 0 || len > BPF_MAXINSNS) return -EINVAL; if (new_prog) { -- cgit v1.2.1 From 8e4946ccdc09e0f04d2cb21f01886bd33de8532b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 19 Jun 2014 18:12:15 -0700 Subject: Revert "net: return actual error on register_queue_kobjects" This reverts commit d36a4f4b472334562b8e7252e35d3d770db83815. Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 5c1c1e526a2b..1cac29ebb05b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1200,8 +1200,8 @@ static int register_queue_kobjects(struct net_device *net) #ifdef CONFIG_SYSFS net->queues_kset = kset_create_and_add("queues", NULL, &net->dev.kobj); - if (IS_ERR(net->queues_kset)) - return PTR_ERR(net->queues_kset); + if (!net->queues_kset) + return -ENOMEM; real_rx = net->real_num_rx_queues; #endif real_tx = net->real_num_tx_queues; -- cgit v1.2.1 From 2cd0d743b05e87445c54ca124a9916f22f16742e Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 18 Jun 2014 21:15:03 -0400 Subject: tcp: fix tcp_match_skb_to_sack() for unaligned SACK at end of an skb If there is an MSS change (or misbehaving receiver) that causes a SACK to arrive that covers the end of an skb but is less than one MSS, then tcp_match_skb_to_sack() was rounding up pkt_len to the full length of the skb ("Round if necessary..."), then chopping all bytes off the skb and creating a zero-byte skb in the write queue. This was visible now because the recently simplified TLP logic in bef1909ee3ed1c ("tcp: fixing TLP's FIN recovery") could find that 0-byte skb at the end of the write queue, and now that we do not check that skb's length we could send it as a TLP probe. Consider the following example scenario: mss: 1000 skb: seq: 0 end_seq: 4000 len: 4000 SACK: start_seq: 3999 end_seq: 4000 The tcp_match_skb_to_sack() code will compute: in_sack = false pkt_len = start_seq - TCP_SKB_CB(skb)->seq = 3999 - 0 = 3999 new_len = (pkt_len / mss) * mss = (3999/1000)*1000 = 3000 new_len += mss = 4000 Previously we would find the new_len > skb->len check failing, so we would fall through and set pkt_len = new_len = 4000 and chop off pkt_len of 4000 from the 4000-byte skb, leaving a 0-byte segment afterward in the write queue. With this new commit, we notice that the new new_len >= skb->len check succeeds, so that we return without trying to fragment. Fixes: adb92db857ee ("tcp: Make SACK code to split only at mss boundaries") Reported-by: Eric Dumazet Signed-off-by: Neal Cardwell Cc: Eric Dumazet Cc: Yuchung Cheng Cc: Ilpo Jarvinen Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 40661fc1e233..b5c23756965a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1162,7 +1162,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, unsigned int new_len = (pkt_len / mss) * mss; if (!in_sack && new_len < pkt_len) { new_len += mss; - if (new_len > skb->len) + if (new_len >= skb->len) return 0; } pkt_len = new_len; -- cgit v1.2.1 From 24599e61b7552673dd85971cf5a35369cd8c119e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 18 Jun 2014 23:46:31 +0200 Subject: net: sctp: check proc_dointvec result in proc_sctp_do_auth When writing to the sysctl field net.sctp.auth_enable, it can well be that the user buffer we handed over to proc_dointvec() via proc_sctp_do_auth() handler contains something other than integers. In that case, we would set an uninitialized 4-byte value from the stack to net->sctp.auth_enable that can be leaked back when reading the sysctl variable, and it can unintentionally turn auth_enable on/off based on the stack content since auth_enable is interpreted as a boolean. Fix it up by making sure proc_dointvec() returned sucessfully. Fixes: b14878ccb7fa ("net: sctp: cache auth_enable per endpoint") Reported-by: Florian Westphal Signed-off-by: Daniel Borkmann Acked-by: Neil Horman Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/sysctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index cc12162ba091..12c7e01c2677 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -447,8 +447,7 @@ static int proc_sctp_do_auth(struct ctl_table *ctl, int write, tbl.data = &net->sctp.auth_enable; ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); - - if (write) { + if (write && ret == 0) { struct sock *sk = net->sctp.ctl_sock; net->sctp.auth_enable = new_value; -- cgit v1.2.1 From c7262e711ae6e466baeb9ddc21d678c878469b1f Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 17 Jun 2014 13:07:37 +0300 Subject: Bluetooth: Fix overriding higher security level in SMP When we receive a pairing request or an internal request to start pairing we shouldn't blindly overwrite the existing pending_sec_level value as that may actually be higher than the new one. This patch fixes the SMP code to only overwrite the value in case the new one is higher than the old. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index f2829a7932e2..0189ec8b68d1 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -669,7 +669,7 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_pairing rsp, *req = (void *) skb->data; struct smp_chan *smp; - u8 key_size, auth; + u8 key_size, auth, sec_level; int ret; BT_DBG("conn %p", conn); @@ -695,7 +695,9 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) /* We didn't start the pairing, so match remote */ auth = req->auth_req; - conn->hcon->pending_sec_level = authreq_to_seclevel(auth); + sec_level = authreq_to_seclevel(auth); + if (sec_level > conn->hcon->pending_sec_level) + conn->hcon->pending_sec_level = sec_level; build_pairing_cmd(conn, req, &rsp, auth); @@ -838,6 +840,7 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_cmd_pairing cp; struct hci_conn *hcon = conn->hcon; struct smp_chan *smp; + u8 sec_level; BT_DBG("conn %p", conn); @@ -847,7 +850,9 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) if (!(conn->hcon->link_mode & HCI_LM_MASTER)) return SMP_CMD_NOTSUPP; - hcon->pending_sec_level = authreq_to_seclevel(rp->auth_req); + sec_level = authreq_to_seclevel(rp->auth_req); + if (sec_level > hcon->pending_sec_level) + hcon->pending_sec_level = sec_level; if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; @@ -901,9 +906,12 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) if (smp_sufficient_security(hcon, sec_level)) return 1; + if (sec_level > hcon->pending_sec_level) + hcon->pending_sec_level = sec_level; + if (hcon->link_mode & HCI_LM_MASTER) - if (smp_ltk_encrypt(conn, sec_level)) - goto done; + if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) + return 0; if (test_and_set_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) return 0; @@ -918,7 +926,7 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) * requires it. */ if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT || - sec_level > BT_SECURITY_MEDIUM) + hcon->pending_sec_level > BT_SECURITY_MEDIUM) authreq |= SMP_AUTH_MITM; if (hcon->link_mode & HCI_LM_MASTER) { @@ -937,9 +945,6 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) set_bit(SMP_FLAG_INITIATOR, &smp->flags); -done: - hcon->pending_sec_level = sec_level; - return 0; } -- cgit v1.2.1 From 581370cc74ea75426421a1f5851ef05e2e995b01 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 17 Jun 2014 13:07:38 +0300 Subject: Bluetooth: Refactor authentication method lookup into its own function We'll need to do authentication method lookups from more than one place, so refactor the lookup into its own function. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 0189ec8b68d1..7156f4720644 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -385,6 +385,16 @@ static const u8 gen_method[5][5] = { { CFM_PASSKEY, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, OVERLAP }, }; +static u8 get_auth_method(struct smp_chan *smp, u8 local_io, u8 remote_io) +{ + /* If either side has unknown io_caps, use JUST WORKS */ + if (local_io > SMP_IO_KEYBOARD_DISPLAY || + remote_io > SMP_IO_KEYBOARD_DISPLAY) + return JUST_WORKS; + + return gen_method[remote_io][local_io]; +} + static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, u8 local_io, u8 remote_io) { @@ -401,14 +411,11 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, BT_DBG("tk_request: auth:%d lcl:%d rem:%d", auth, local_io, remote_io); /* If neither side wants MITM, use JUST WORKS */ - /* If either side has unknown io_caps, use JUST WORKS */ /* Otherwise, look up method from the table */ - if (!(auth & SMP_AUTH_MITM) || - local_io > SMP_IO_KEYBOARD_DISPLAY || - remote_io > SMP_IO_KEYBOARD_DISPLAY) + if (!(auth & SMP_AUTH_MITM)) method = JUST_WORKS; else - method = gen_method[remote_io][local_io]; + method = get_auth_method(smp, local_io, remote_io); /* If not bonding, don't ask user to confirm a Zero TK */ if (!(auth & SMP_AUTH_BONDING) && method == JUST_CFM) -- cgit v1.2.1 From 2ed8f65ca262bca778e60053f667ce11b32db6b8 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 17 Jun 2014 13:07:39 +0300 Subject: Bluetooth: Fix rejecting pairing in case of insufficient capabilities If we need an MITM protected connection but the local and remote IO capabilities cannot provide it we should reject the pairing attempt in the appropriate way. This patch adds the missing checks for such a situation to the smp_cmd_pairing_req() and smp_cmd_pairing_rsp() functions. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 7156f4720644..e33a982161c1 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -706,6 +706,16 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) if (sec_level > conn->hcon->pending_sec_level) conn->hcon->pending_sec_level = sec_level; + /* If we need MITM check that it can be acheived */ + if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) { + u8 method; + + method = get_auth_method(smp, conn->hcon->io_capability, + req->io_capability); + if (method == JUST_WORKS || method == JUST_CFM) + return SMP_AUTH_REQUIREMENTS; + } + build_pairing_cmd(conn, req, &rsp, auth); key_size = min(req->max_key_size, rsp.max_key_size); @@ -752,6 +762,16 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) if (check_enc_key_size(conn, key_size)) return SMP_ENC_KEY_SIZE; + /* If we need MITM check that it can be acheived */ + if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) { + u8 method; + + method = get_auth_method(smp, req->io_capability, + rsp->io_capability); + if (method == JUST_WORKS || method == JUST_CFM) + return SMP_AUTH_REQUIREMENTS; + } + get_random_bytes(smp->prnd, sizeof(smp->prnd)); smp->prsp[0] = SMP_CMD_PAIRING_RSP; -- cgit v1.2.1 From 1d56dc4f5f7cdf0ba99062d974b7586a28fc5cf4 Mon Sep 17 00:00:00 2001 From: Lukasz Rymanowski Date: Tue, 17 Jun 2014 13:04:20 +0200 Subject: Bluetooth: Fix for ACL disconnect when pairing fails When pairing fails hci_conn refcnt drops below zero. This cause that ACL link is not disconnected when disconnect timeout fires. Probably this is because l2cap_conn_del calls l2cap_chan_del for each channel, and inside l2cap_chan_del conn is dropped. After that loop hci_chan_del is called which also drops conn. Anyway, as it is desrcibed in hci_core.h, it is known that refcnt drops below 0 sometimes and it should be fine. If so, let disconnect link when hci_conn_timeout fires and refcnt is 0 or below. This patch does it. This affects PTS test SM_TC_JW_BV_05_C Logs from scenario: [69713.706227] [6515] pair_device: [69713.706230] [6515] hci_conn_add: hci0 dst 00:1b:dc:06:06:22 [69713.706233] [6515] hci_dev_hold: hci0 orig refcnt 8 [69713.706235] [6515] hci_conn_init_sysfs: conn ffff88021f65a000 [69713.706239] [6515] hci_req_add_ev: hci0 opcode 0x200d plen 25 [69713.706242] [6515] hci_prepare_cmd: skb len 28 [69713.706243] [6515] hci_req_run: length 1 [69713.706248] [6515] hci_conn_hold: hcon ffff88021f65a000 orig refcnt 0 [69713.706251] [6515] hci_dev_put: hci0 orig refcnt 9 [69713.706281] [8909] hci_cmd_work: hci0 cmd_cnt 1 cmd queued 1 [69713.706288] [8909] hci_send_frame: hci0 type 1 len 28 [69713.706290] [8909] hci_send_to_monitor: hdev ffff88021f0c7000 len 28 [69713.706316] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000 [69713.706382] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000 [69713.711664] [8909] hci_rx_work: hci0 [69713.711668] [8909] hci_send_to_monitor: hdev ffff88021f0c7000 len 6 [69713.711680] [8909] hci_rx_work: hci0 Event packet [69713.711683] [8909] hci_cs_le_create_conn: hci0 status 0x00 [69713.711685] [8909] hci_sent_cmd_data: hci0 opcode 0x200d [69713.711688] [8909] hci_req_cmd_complete: opcode 0x200d status 0x00 [69713.711690] [8909] hci_sent_cmd_data: hci0 opcode 0x200d [69713.711695] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000 [69713.711744] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000 [69713.818875] [8909] hci_rx_work: hci0 [69713.818889] [8909] hci_send_to_monitor: hdev ffff88021f0c7000 len 21 [69713.818913] [8909] hci_rx_work: hci0 Event packet [69713.818917] [8909] hci_le_conn_complete_evt: hci0 status 0x00 [69713.818922] [8909] hci_send_to_control: len 19 [69713.818927] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000 [69713.818938] [8909] hci_conn_add_sysfs: conn ffff88021f65a000 [69713.818975] [6450] bt_sock_poll: sock ffff88005e758500, sk ffff88010323b800 [69713.818981] [6515] hci_sock_recvmsg: sock ffff88005e75a080, sk ffff88010323ac00 ... [69713.819021] [8909] hci_dev_hold: hci0 orig refcnt 10 [69713.819025] [8909] l2cap_connect_cfm: hcon ffff88021f65a000 bdaddr 00:1b:dc:06:06:22 status 0 [69713.819028] [8909] hci_chan_create: hci0 hcon ffff88021f65a000 [69713.819031] [8909] l2cap_conn_add: hcon ffff88021f65a000 conn ffff880221005c00 hchan ffff88020d60b1c0 [69713.819034] [8909] l2cap_conn_ready: conn ffff880221005c00 [69713.819036] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000 [69713.819037] [8909] smp_conn_security: conn ffff880221005c00 hcon ffff88021f65a000 level 0x02 [69713.819039] [8909] smp_chan_create: [69713.819041] [8909] hci_conn_hold: hcon ffff88021f65a000 orig refcnt 1 [69713.819043] [8909] smp_send_cmd: code 0x01 [69713.819045] [8909] hci_send_acl: hci0 chan ffff88020d60b1c0 flags 0x0000 [69713.819046] [5949] hci_sock_recvmsg: sock ffff8800941a9900, sk ffff88012bf4e800 [69713.819049] [8909] hci_queue_acl: hci0 nonfrag skb ffff88005157c100 len 15 [69713.819055] [5949] hci_sock_recvmsg: sock ffff8800941a9900, sk ffff88012bf4e800 [69713.819057] [8909] l2cap_le_conn_ready: [69713.819064] [8909] l2cap_chan_create: chan ffff88005ede2c00 [69713.819066] [8909] l2cap_chan_hold: chan ffff88005ede2c00 orig refcnt 1 [69713.819069] [8909] l2cap_sock_init: sk ffff88005ede5800 [69713.819072] [8909] bt_accept_enqueue: parent ffff880160356000, sk ffff88005ede5800 [69713.819074] [8909] __l2cap_chan_add: conn ffff880221005c00, psm 0x00, dcid 0x0004 [69713.819076] [8909] l2cap_chan_hold: chan ffff88005ede2c00 orig refcnt 2 [69713.819078] [8909] hci_conn_hold: hcon ffff88021f65a000 orig refcnt 2 [69713.819080] [8909] smp_conn_security: conn ffff880221005c00 hcon ffff88021f65a000 level 0x01 [69713.819082] [8909] l2cap_sock_ready_cb: sk ffff88005ede5800, parent ffff880160356000 [69713.819086] [8909] le_pairing_complete_cb: status 0 [69713.819091] [8909] hci_tx_work: hci0 acl 10 sco 8 le 0 [69713.819093] [8909] hci_sched_acl: hci0 [69713.819094] [8909] hci_sched_sco: hci0 [69713.819096] [8909] hci_sched_esco: hci0 [69713.819098] [8909] hci_sched_le: hci0 [69713.819099] [8909] hci_chan_sent: hci0 [69713.819101] [8909] hci_chan_sent: chan ffff88020d60b1c0 quote 10 [69713.819104] [8909] hci_sched_le: chan ffff88020d60b1c0 skb ffff88005157c100 len 15 priority 7 [69713.819106] [8909] hci_send_frame: hci0 type 2 len 15 [69713.819108] [8909] hci_send_to_monitor: hdev ffff88021f0c7000 len 15 [69713.819119] [8909] hci_chan_sent: hci0 [69713.819121] [8909] hci_prio_recalculate: hci0 [69713.819123] [8909] process_pending_rx: [69713.819226] [6450] hci_sock_recvmsg: sock ffff88005e758780, sk ffff88010323d400 ... [69713.822022] [6450] l2cap_sock_accept: sk ffff880160356000 timeo 0 [69713.822024] [6450] bt_accept_dequeue: parent ffff880160356000 [69713.822026] [6450] bt_accept_unlink: sk ffff88005ede5800 state 1 [69713.822028] [6450] l2cap_sock_accept: new socket ffff88005ede5800 [69713.822368] [6450] l2cap_sock_getname: sock ffff8800941ab700, sk ffff88005ede5800 [69713.822375] [6450] l2cap_sock_getsockopt: sk ffff88005ede5800 [69713.822383] [6450] l2cap_sock_getname: sock ffff8800941ab700, sk ffff88005ede5800 [69713.822414] [6450] bt_sock_poll: sock ffff8800941ab700, sk ffff88005ede5800 ... [69713.823255] [6450] l2cap_sock_getname: sock ffff8800941ab700, sk ffff88005ede5800 [69713.823259] [6450] l2cap_sock_getsockopt: sk ffff88005ede5800 [69713.824322] [6450] l2cap_sock_getname: sock ffff8800941ab700, sk ffff88005ede5800 [69713.824330] [6450] l2cap_sock_getsockopt: sk ffff88005ede5800 [69713.825029] [6450] bt_sock_poll: sock ffff88005e758500, sk ffff88010323b800 ... [69713.825187] [6450] l2cap_sock_sendmsg: sock ffff8800941ab700, sk ffff88005ede5800 [69713.825189] [6450] bt_sock_wait_ready: sk ffff88005ede5800 [69713.825192] [6450] l2cap_create_basic_pdu: chan ffff88005ede2c00 len 3 [69713.825196] [6450] l2cap_do_send: chan ffff88005ede2c00, skb ffff880160b0b500 len 7 priority 0 [69713.825199] [6450] hci_send_acl: hci0 chan ffff88020d60b1c0 flags 0x0000 [69713.825201] [6450] hci_queue_acl: hci0 nonfrag skb ffff880160b0b500 len 11 [69713.825210] [8909] hci_tx_work: hci0 acl 9 sco 8 le 0 [69713.825213] [8909] hci_sched_acl: hci0 [69713.825214] [8909] hci_sched_sco: hci0 [69713.825216] [8909] hci_sched_esco: hci0 [69713.825217] [8909] hci_sched_le: hci0 [69713.825219] [8909] hci_chan_sent: hci0 [69713.825221] [8909] hci_chan_sent: chan ffff88020d60b1c0 quote 9 [69713.825223] [8909] hci_sched_le: chan ffff88020d60b1c0 skb ffff880160b0b500 len 11 priority 0 [69713.825225] [8909] hci_send_frame: hci0 type 2 len 11 [69713.825227] [8909] hci_send_to_monitor: hdev ffff88021f0c7000 len 11 [69713.825242] [8909] hci_chan_sent: hci0 [69713.825253] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000 [69713.825253] [8909] hci_prio_recalculate: hci0 [69713.825292] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000 [69713.825768] [6450] bt_sock_poll: sock ffff88005e758500, sk ffff88010323b800 ... [69713.866902] [8909] hci_rx_work: hci0 [69713.866921] [8909] hci_send_to_monitor: hdev ffff88021f0c7000 len 7 [69713.866928] [8909] hci_rx_work: hci0 Event packet [69713.866931] [8909] hci_num_comp_pkts_evt: hci0 num_hndl 1 [69713.866937] [8909] hci_tx_work: hci0 acl 9 sco 8 le 0 [69713.866939] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000 [69713.866940] [8909] hci_sched_acl: hci0 ... [69713.866944] [8909] hci_sched_le: hci0 [69713.866953] [8909] hci_chan_sent: hci0 [69713.866997] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000 [69713.867840] [28074] hci_rx_work: hci0 [69713.867844] [28074] hci_send_to_monitor: hdev ffff88021f0c7000 len 7 [69713.867850] [28074] hci_rx_work: hci0 Event packet [69713.867853] [28074] hci_num_comp_pkts_evt: hci0 num_hndl 1 [69713.867857] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000 [69713.867858] [28074] hci_tx_work: hci0 acl 10 sco 8 le 0 [69713.867860] [28074] hci_sched_acl: hci0 [69713.867861] [28074] hci_sched_sco: hci0 [69713.867862] [28074] hci_sched_esco: hci0 [69713.867863] [28074] hci_sched_le: hci0 [69713.867865] [28074] hci_chan_sent: hci0 [69713.867888] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000 [69714.145661] [8909] hci_rx_work: hci0 [69714.145666] [8909] hci_send_to_monitor: hdev ffff88021f0c7000 len 10 [69714.145676] [8909] hci_rx_work: hci0 ACL data packet [69714.145679] [8909] hci_acldata_packet: hci0 len 6 handle 0x002d flags 0x0002 [69714.145681] [8909] hci_conn_enter_active_mode: hcon ffff88021f65a000 mode 0 [69714.145683] [8909] l2cap_recv_acldata: conn ffff880221005c00 len 6 flags 0x2 [69714.145693] [8909] l2cap_recv_frame: len 2, cid 0x0006 [69714.145696] [8909] hci_send_to_control: len 14 [69714.145710] [8909] smp_chan_destroy: [69714.145713] [8909] pairing_complete: status 3 [69714.145714] [8909] cmd_complete: sock ffff88010323ac00 [69714.145717] [8909] hci_conn_drop: hcon ffff88021f65a000 orig refcnt 3 [69714.145719] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000 [69714.145720] [6450] bt_sock_poll: sock ffff88005e758500, sk ffff88010323b800 [69714.145722] [6515] hci_sock_recvmsg: sock ffff88005e75a080, sk ffff88010323ac00 [69714.145724] [6450] bt_sock_poll: sock ffff8801db6b4f00, sk ffff880160351c00 ... [69714.145735] [6515] hci_sock_recvmsg: sock ffff88005e75a080, sk ffff88010323ac00 [69714.145737] [8909] hci_conn_drop: hcon ffff88021f65a000 orig refcnt 2 [69714.145739] [8909] l2cap_conn_del: hcon ffff88021f65a000 conn ffff880221005c00, err 13 [69714.145740] [6450] bt_sock_poll: sock ffff8801db6b5400, sk ffff88021e775000 [69714.145743] [6450] bt_sock_poll: sock ffff8801db6b5e00, sk ffff880160356000 [69714.145744] [8909] l2cap_chan_hold: chan ffff88005ede2c00 orig refcnt 3 [69714.145746] [6450] bt_sock_poll: sock ffff8800941ab700, sk ffff88005ede5800 [69714.145748] [8909] l2cap_chan_del: chan ffff88005ede2c00, conn ffff880221005c00, err 13 [69714.145749] [8909] l2cap_chan_put: chan ffff88005ede2c00 orig refcnt 4 [69714.145751] [8909] hci_conn_drop: hcon ffff88021f65a000 orig refcnt 1 [69714.145754] [6450] bt_sock_poll: sock ffff8800941ab700, sk ffff88005ede5800 [69714.145756] [8909] l2cap_chan_put: chan ffff88005ede2c00 orig refcnt 3 [69714.145759] [8909] hci_chan_del: hci0 hcon ffff88021f65a000 chan ffff88020d60b1c0 [69714.145766] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000 [69714.145787] [6515] hci_sock_release: sock ffff88005e75a080 sk ffff88010323ac00 [69714.146002] [6450] hci_sock_recvmsg: sock ffff88005e758780, sk ffff88010323d400 [69714.150795] [6450] l2cap_sock_release: sock ffff8800941ab700, sk ffff88005ede5800 [69714.150799] [6450] l2cap_sock_shutdown: sock ffff8800941ab700, sk ffff88005ede5800 [69714.150802] [6450] l2cap_chan_close: chan ffff88005ede2c00 state BT_CLOSED [69714.150805] [6450] l2cap_sock_kill: sk ffff88005ede5800 state BT_CLOSED [69714.150806] [6450] l2cap_chan_put: chan ffff88005ede2c00 orig refcnt 2 [69714.150808] [6450] l2cap_sock_destruct: sk ffff88005ede5800 [69714.150809] [6450] l2cap_chan_put: chan ffff88005ede2c00 orig refcnt 1 [69714.150811] [6450] l2cap_chan_destroy: chan ffff88005ede2c00 [69714.150970] [6450] bt_sock_poll: sock ffff88005e758500, sk ffff88010323b800 ... [69714.151991] [8909] hci_conn_drop: hcon ffff88021f65a000 orig refcnt 0 [69716.150339] [8909] hci_conn_timeout: hcon ffff88021f65a000 state BT_CONNECTED, refcnt -1 Signed-off-by: Lukasz Rymanowski Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index ca01d1861854..a7a27bc2c0b1 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -289,10 +289,20 @@ static void hci_conn_timeout(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, disc_work.work); + int refcnt = atomic_read(&conn->refcnt); BT_DBG("hcon %p state %s", conn, state_to_string(conn->state)); - if (atomic_read(&conn->refcnt)) + WARN_ON(refcnt < 0); + + /* FIXME: It was observed that in pairing failed scenario, refcnt + * drops below 0. Probably this is because l2cap_conn_del calls + * l2cap_chan_del for each channel, and inside l2cap_chan_del conn is + * dropped. After that loop hci_chan_del is called which also drops + * conn. For now make sure that ACL is alive if refcnt is higher then 0, + * otherwise drop it. + */ + if (refcnt > 0) return; switch (conn->state) { -- cgit v1.2.1 From 916c1689a09bc1ca81f2d7a34876f8d35aadd11b Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 18 Jun 2014 13:46:02 +0800 Subject: 8021q: fix a potential memory leak skb_cow called in vlan_reorder_header does not free the skb when it failed, and vlan_reorder_header returns NULL to reset original skb when it is called in vlan_untag, lead to a memory leak. Signed-off-by: Li RongQing Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/8021q/vlan_core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 9012b1c922b6..75d427763992 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -114,8 +114,11 @@ EXPORT_SYMBOL(vlan_dev_vlan_proto); static struct sk_buff *vlan_reorder_header(struct sk_buff *skb) { - if (skb_cow(skb, skb_headroom(skb)) < 0) + if (skb_cow(skb, skb_headroom(skb)) < 0) { + kfree_skb(skb); return NULL; + } + memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); skb->mac_header += VLAN_HLEN; return skb; -- cgit v1.2.1 From 744462a91ecf5d1ec64857488bf99000ef626921 Mon Sep 17 00:00:00 2001 From: Max Stepanov Date: Tue, 10 Jun 2014 20:00:08 +0300 Subject: mac80211: WEP extra head/tail room in ieee80211_send_auth After skb allocation and call to ieee80211_wep_encrypt in ieee80211_send_auth the flow fails with a warning in ieee80211_wep_add_iv on verification of available head/tailroom needed for WEP_IV and WEP_ICV. Signed-off-by: Max Stepanov Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/util.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 6886601afe1c..a6cda52ed920 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1096,11 +1096,12 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, int err; /* 24 + 6 = header + auth_algo + auth_transaction + status_code */ - skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24 + 6 + extra_len); + skb = dev_alloc_skb(local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN + + 24 + 6 + extra_len + IEEE80211_WEP_ICV_LEN); if (!skb) return; - skb_reserve(skb, local->hw.extra_tx_headroom); + skb_reserve(skb, local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN); mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6); memset(mgmt, 0, 24 + 6); -- cgit v1.2.1 From e33e2241e272eddc38339692500bd1c7d8753a77 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 23 Jun 2014 11:06:16 +0200 Subject: Revert "cfg80211: Use 5MHz bandwidth by default when checking usable channels" This reverts commit 8eca1fb692cc9557f386eddce75c300a3855d11a. Felix notes that this broke regulatory, leaving channel 12 open for AP operation in the US regulatory domain where it isn't permitted. Link: http://mid.gmane.org/53A6C0FF.9090104@openwrt.org Reported-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/wireless/reg.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 558b0e3a02d8..1afdf45db38f 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -935,7 +935,7 @@ freq_reg_info_regd(struct wiphy *wiphy, u32 center_freq, if (!band_rule_found) band_rule_found = freq_in_rule_band(fr, center_freq); - bw_fits = reg_does_bw_fit(fr, center_freq, MHZ_TO_KHZ(5)); + bw_fits = reg_does_bw_fit(fr, center_freq, MHZ_TO_KHZ(20)); if (band_rule_found && bw_fits) return rr; @@ -1019,10 +1019,10 @@ static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd, } #endif -/* Find an ieee80211_reg_rule such that a 5MHz channel with frequency - * chan->center_freq fits there. - * If there is no such reg_rule, disable the channel, otherwise set the - * flags corresponding to the bandwidths allowed in the particular reg_rule +/* + * Note that right now we assume the desired channel bandwidth + * is always 20 MHz for each individual channel (HT40 uses 20 MHz + * per channel, the primary and the extension channel). */ static void handle_channel(struct wiphy *wiphy, enum nl80211_reg_initiator initiator, @@ -1083,12 +1083,8 @@ static void handle_channel(struct wiphy *wiphy, if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); - if (max_bandwidth_khz < MHZ_TO_KHZ(10)) - bw_flags = IEEE80211_CHAN_NO_10MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(20)) - bw_flags |= IEEE80211_CHAN_NO_20MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(40)) - bw_flags |= IEEE80211_CHAN_NO_HT40; + bw_flags = IEEE80211_CHAN_NO_HT40; if (max_bandwidth_khz < MHZ_TO_KHZ(80)) bw_flags |= IEEE80211_CHAN_NO_80MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(160)) @@ -1522,12 +1518,8 @@ static void handle_channel_custom(struct wiphy *wiphy, if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); - if (max_bandwidth_khz < MHZ_TO_KHZ(10)) - bw_flags = IEEE80211_CHAN_NO_10MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(20)) - bw_flags |= IEEE80211_CHAN_NO_20MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(40)) - bw_flags |= IEEE80211_CHAN_NO_HT40; + bw_flags = IEEE80211_CHAN_NO_HT40; if (max_bandwidth_khz < MHZ_TO_KHZ(80)) bw_flags |= IEEE80211_CHAN_NO_80MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(160)) -- cgit v1.2.1 From 0ce12026d6ea4a24d52ddcde36b9643f2e26d560 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Tue, 17 Jun 2014 13:07:02 +0300 Subject: cfg80211: fix elapsed_jiffies calculation MAX_JIFFY_OFFSET has no meaning when calculating the elapsed jiffies, as jiffies run out until ULONG_MAX. This miscalculation results in erroneous values in case of a wrap-around. Signed-off-by: Eliad Peller Signed-off-by: Johannes Berg --- net/wireless/core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/core.h b/net/wireless/core.h index e9afbf10e756..7e3a3cef7df9 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -424,7 +424,7 @@ static inline unsigned int elapsed_jiffies_msecs(unsigned long start) if (end >= start) return jiffies_to_msecs(end - start); - return jiffies_to_msecs(end + (MAX_JIFFY_OFFSET - start) + 1); + return jiffies_to_msecs(end + (ULONG_MAX - start) + 1); } void -- cgit v1.2.1 From 02df00eb0019e7d15a1fcddebe4d020226c1ccda Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 10 Jun 2014 14:06:25 +0200 Subject: nl80211: move set_qos_map command into split state The non-split wiphy state shouldn't be increased in size so move the new set_qos_map command into the split if statement. Cc: stable@vger.kernel.org (3.14+) Fixes: fa9ffc745610 ("cfg80211: Add support for QoS mapping") Reviewed-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ba4f1723c83a..6668daf69326 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1497,18 +1497,17 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, } CMD(start_p2p_device, START_P2P_DEVICE); CMD(set_mcast_rate, SET_MCAST_RATE); +#ifdef CONFIG_NL80211_TESTMODE + CMD(testmode_cmd, TESTMODE); +#endif if (state->split) { CMD(crit_proto_start, CRIT_PROTOCOL_START); CMD(crit_proto_stop, CRIT_PROTOCOL_STOP); if (rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH) CMD(channel_switch, CHANNEL_SWITCH); + CMD(set_qos_map, SET_QOS_MAP); } - CMD(set_qos_map, SET_QOS_MAP); - -#ifdef CONFIG_NL80211_TESTMODE - CMD(testmode_cmd, TESTMODE); -#endif - + /* add into the if now */ #undef CMD if (rdev->ops->connect || rdev->ops->auth) { -- cgit v1.2.1 From 285276e72cbaa5be2147aac93133944882bced22 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 24 Jun 2014 15:33:20 +0200 Subject: trivial: net: filter: Fix typo in comment Signed-off-by: Tobias Klauser Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index a44e12cdde4c..ff9235e10b19 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1382,7 +1382,7 @@ static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp, fp_new = sock_kmalloc(sk, len, GFP_KERNEL); if (fp_new) { *fp_new = *fp; - /* As we're kepping orig_prog in fp_new along, + /* As we're keeping orig_prog in fp_new along, * we need to make sure we're not evicting it * from the old fp. */ -- cgit v1.2.1 From 677a9fd3e6e6e03e11b979b69c9f8c813583683a Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 24 Jun 2014 15:33:21 +0200 Subject: trivial: net: filter: Change kerneldoc parameter order Change the order of the parameters to sk_unattached_filter_create() in the kerneldoc to reflect the order they appear in the actual function. This fix is only cosmetic, in the generated doc they still appear in the correct order without the fix. Signed-off-by: Tobias Klauser Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index ff9235e10b19..4d13b125ef4b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1524,8 +1524,8 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, /** * sk_unattached_filter_create - create an unattached filter - * @fprog: the filter program * @pfp: the unattached filter that is created + * @fprog: the filter program * * Create a filter independent of any socket. We first run some * sanity checks on it to make sure it does not explode on us later. -- cgit v1.2.1 From 99e72a0fed07d118d329f3046ad2ec2ae9357d63 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 24 Jun 2014 15:33:22 +0200 Subject: net: filter: Use kcalloc/kmalloc_array to allocate arrays Use kcalloc/kmalloc_array to make it clear we're allocating arrays. No integer overflow can actually happen here, since len/flen is guaranteed to be less than BPF_MAXINSNS (4096). However, this changed makes sure we're not going to get one if BPF_MAXINSNS were ever increased. Signed-off-by: Tobias Klauser Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 4d13b125ef4b..1dbf6462f766 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -844,7 +844,7 @@ int sk_convert_filter(struct sock_filter *prog, int len, return -EINVAL; if (new_prog) { - addrs = kzalloc(len * sizeof(*addrs), GFP_KERNEL); + addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL); if (!addrs) return -ENOMEM; } @@ -1101,7 +1101,7 @@ static int check_load_and_stores(struct sock_filter *filter, int flen) BUILD_BUG_ON(BPF_MEMWORDS > 16); - masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL); + masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL); if (!masks) return -ENOMEM; -- cgit v1.2.1 From f88649721268999bdff09777847080a52004f691 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Jun 2014 10:05:11 -0700 Subject: ipv4: fix dst race in sk_dst_get() When IP route cache had been removed in linux-3.6, we broke assumption that dst entries were all freed after rcu grace period. DST_NOCACHE dst were supposed to be freed from dst_release(). But it appears we want to keep such dst around, either in UDP sockets or tunnels. In sk_dst_get() we need to make sure dst refcount is not 0 before incrementing it, or else we might end up freeing a dst twice. DST_NOCACHE set on a dst does not mean this dst can not be attached to a socket or a tunnel. Then, before actual freeing, we need to observe a rcu grace period to make sure all other cpus can catch the fact the dst is no longer usable. Signed-off-by: Eric Dumazet Reported-by: Dormando Signed-off-by: David S. Miller --- net/core/dst.c | 16 +++++++++++----- net/ipv4/ip_tunnel.c | 14 +++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/core/dst.c b/net/core/dst.c index 80d6286c8b62..a028409ee438 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -269,6 +269,15 @@ again: } EXPORT_SYMBOL(dst_destroy); +static void dst_destroy_rcu(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); +} + void dst_release(struct dst_entry *dst) { if (dst) { @@ -276,11 +285,8 @@ void dst_release(struct dst_entry *dst) newrefcnt = atomic_dec_return(&dst->__refcnt); WARN_ON(newrefcnt < 0); - if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { - dst = dst_destroy(dst); - if (dst) - __dst_free(dst); - } + if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) + call_rcu(&dst->rcu_head, dst_destroy_rcu); } } EXPORT_SYMBOL(dst_release); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 097b3e7c1e8f..54b6731dab55 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -73,12 +73,7 @@ static void __tunnel_dst_set(struct ip_tunnel_dst *idst, { struct dst_entry *old_dst; - if (dst) { - if (dst->flags & DST_NOCACHE) - dst = NULL; - else - dst_clone(dst); - } + dst_clone(dst); old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); dst_release(old_dst); } @@ -108,13 +103,14 @@ static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie) rcu_read_lock(); dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst); + if (dst && !atomic_inc_not_zero(&dst->__refcnt)) + dst = NULL; if (dst) { if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) { - rcu_read_unlock(); tunnel_dst_reset(t); - return NULL; + dst_release(dst); + dst = NULL; } - dst_hold(dst); } rcu_read_unlock(); return (struct rtable *)dst; -- cgit v1.2.1 From de843723f9b989178762196fb24dd050cbe20ca3 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 25 Jun 2014 12:51:01 -0700 Subject: net: fix setting csum_start in skb_segment() Dave Jones reported that a crash is occurring in csum_partial tcp_gso_segment inet_gso_segment ? update_dl_migration skb_mac_gso_segment __skb_gso_segment dev_hard_start_xmit sch_direct_xmit __dev_queue_xmit ? dev_hard_start_xmit dev_queue_xmit ip_finish_output ? ip_output ip_output ip_forward_finish ip_forward ip_rcv_finish ip_rcv __netif_receive_skb_core ? __netif_receive_skb_core ? trace_hardirqs_on __netif_receive_skb netif_receive_skb_internal napi_gro_complete ? napi_gro_complete dev_gro_receive ? dev_gro_receive napi_gro_receive It looks like a likely culprit is that SKB_GSO_CB()->csum_start is not set correctly when doing non-scatter gather. We are using offset as opposed to doffset. Reported-by: Dave Jones Tested-by: Dave Jones Signed-off-by: Tom Herbert Signed-off-by: Eric Dumazet Fixes: 7e2b10c1e52ca ("net: Support for multiple checksums with gso") Acked-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9cd5344fad73..c1a33033cbe2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2993,7 +2993,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, skb_put(nskb, len), len, 0); SKB_GSO_CB(nskb)->csum_start = - skb_headroom(nskb) + offset; + skb_headroom(nskb) + doffset; continue; } -- cgit v1.2.1 From 3e215c8d1b6b772d107f1811b5ee8eae7a046fb4 Mon Sep 17 00:00:00 2001 From: James M Leddy Date: Wed, 25 Jun 2014 17:38:13 -0400 Subject: udp: Add MIB counters for rcvbuferrors Add MIB counters for rcvbuferrors in UDP to help diagnose problems. Signed-off-by: James M Leddy Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 5 ++++- net/ipv6/udp.c | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d92f94b7e402..7d5a8661df76 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1588,8 +1588,11 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto csum_error; - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) + if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, + is_udplite); goto drop; + } rc = 0; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 95c834799288..7092ff78fd84 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -674,8 +674,11 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto csum_error; } - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) + if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + UDP6_INC_STATS_BH(sock_net(sk), + UDP_MIB_RCVBUFERRORS, is_udplite); goto drop; + } skb_dst_drop(skb); @@ -690,6 +693,7 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) bh_unlock_sock(sk); return rc; + csum_error: UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: -- cgit v1.2.1 From e940f5d6ba6a01f8dbb870854d5205d322452730 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 27 Jun 2014 09:57:53 +0800 Subject: ipv6: Fix MLD Query message check Based on RFC3810 6.2, we also need to check the hop limit and router alert option besides source address. Signed-off-by: Hangbin Liu Acked-by: YOSHIFUJI Hideaki Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 08b367c6b9cf..617f0958e164 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1301,8 +1301,17 @@ int igmp6_event_query(struct sk_buff *skb) len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); len -= skb_network_header_len(skb); - /* Drop queries with not link local source */ - if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) + /* RFC3810 6.2 + * Upon reception of an MLD message that contains a Query, the node + * checks if the source address of the message is a valid link-local + * address, if the Hop Limit is set to 1, and if the Router Alert + * option is present in the Hop-By-Hop Options header of the IPv6 + * packet. If any of these checks fails, the packet is dropped. + */ + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL) || + ipv6_hdr(skb)->hop_limit != 1 || + !(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) || + IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD)) return -EINVAL; idev = __in6_dev_get(skb->dev); -- cgit v1.2.1 From ac5ccdba3a1659b3517e7e99ef7d35a6a2d77cf4 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 19 Jun 2014 21:22:56 +0300 Subject: iovec: move memcpy_from/toiovecend to lib/iovec.c ERROR: "memcpy_fromiovecend" [drivers/vhost/vhost_scsi.ko] undefined! commit 9f977ef7b671f6169eca78bf40f230fe84b7c7e5 vhost-scsi: Include prot_bytes into expected data transfer length in target-pending makes drivers/vhost/scsi.c call memcpy_fromiovecend(). This function is not available when CONFIG_NET is not enabled. socket.h already includes uio.h, so no callers need updating. Reported-by: Randy Dunlap Cc: Stephen Rothwell Cc: "David S. Miller" Signed-off-by: David S. Miller Signed-off-by: Michael S. Tsirkin Signed-off-by: Nicholas Bellinger --- net/core/iovec.c | 55 ------------------------------------------------------- 1 file changed, 55 deletions(-) (limited to 'net') diff --git a/net/core/iovec.c b/net/core/iovec.c index b61869429f4c..827dd6beb49c 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -74,61 +74,6 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a return err; } -/* - * Copy kernel to iovec. Returns -EFAULT on error. - */ - -int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, - int offset, int len) -{ - int copy; - for (; len > 0; ++iov) { - /* Skip over the finished iovecs */ - if (unlikely(offset >= iov->iov_len)) { - offset -= iov->iov_len; - continue; - } - copy = min_t(unsigned int, iov->iov_len - offset, len); - if (copy_to_user(iov->iov_base + offset, kdata, copy)) - return -EFAULT; - offset = 0; - kdata += copy; - len -= copy; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_toiovecend); - -/* - * Copy iovec to kernel. Returns -EFAULT on error. - */ - -int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, - int offset, int len) -{ - /* Skip over the finished iovecs */ - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - } - - while (len > 0) { - u8 __user *base = iov->iov_base + offset; - int copy = min_t(unsigned int, len, iov->iov_len - offset); - - offset = 0; - if (copy_from_user(kdata, base, copy)) - return -EFAULT; - len -= copy; - kdata += copy; - iov++; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_fromiovecend); - /* * And now for the all-in-one: copy and checksum from a user iovec * directly to a datagram -- cgit v1.2.1 From fe984c08e20f0fc2b4666bf8eeeb02605568387b Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Tue, 6 May 2014 17:23:48 -0700 Subject: openvswitch: Fix a double free bug for the sample action When sample action returns with an error, the skb has already been freed. This patch fix a bug to make sure we don't free it again. This bug introduced by commit ccb1352e76cff05 (net: Add Open vSwitch kernel components.) Signed-off-by: Andy Zhou Signed-off-by: Pravin B Shelar --- net/openvswitch/actions.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index c36856a457ca..e70d8b18e962 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -551,6 +551,8 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_SAMPLE: err = sample(dp, skb, a); + if (unlikely(err)) /* skb already freed. */ + return err; break; } -- cgit v1.2.1 From e0bb8c44ed5cfcc56b571758ed966ee48779024c Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Sat, 28 Jun 2014 12:34:53 -0700 Subject: openvswitch: supply a dummy err_handler of gre_cisco_protocol to prevent kernel crash When use gre vport, openvswitch register a gre_cisco_protocol but does not supply a err_handler with it. The gre_cisco_err() in net/ipv4/gre_demux.c expect err_handler be provided with the gre_cisco_protocol implementation, and call ->err_handler() without existence check, cause the kernel crash. This patch provide a err_handler to fix this bug. This bug introduced by commit aa310701e787087d (openvswitch: Add gre tunnel support.) Signed-off-by: Wei Zhang Signed-off-by: Jesse Gross Signed-off-by: Pravin B Shelar --- net/openvswitch/vport-gre.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'net') diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 35ec4fed09e2..f49148a07da2 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -110,6 +110,22 @@ static int gre_rcv(struct sk_buff *skb, return PACKET_RCVD; } +/* Called with rcu_read_lock and BH disabled. */ +static int gre_err(struct sk_buff *skb, u32 info, + const struct tnl_ptk_info *tpi) +{ + struct ovs_net *ovs_net; + struct vport *vport; + + ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); + vport = rcu_dereference(ovs_net->vport_net.gre_vport); + + if (unlikely(!vport)) + return PACKET_REJECT; + else + return PACKET_RCVD; +} + static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) { struct net *net = ovs_dp_get_net(vport->dp); @@ -186,6 +202,7 @@ error: static struct gre_cisco_protocol gre_protocol = { .handler = gre_rcv, + .err_handler = gre_err, .priority = 1, }; -- cgit v1.2.1 From ad55200734c65a3ec5d0c39d6ea904008baea536 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 6 May 2014 16:48:38 -0700 Subject: openvswitch: Fix tracking of flags seen in TCP flows. Flow statistics need to take into account the TCP flags from the packet currently being processed (in 'key'), not the TCP flags matched by the flow found in the kernel flow table (in 'flow'). This bug made the Open vSwitch userspace fin_timeout action have no effect in many cases. This bug is introduced by commit 88d73f6c411ac2f0578 (openvswitch: Use TCP flags in the flow key for stats.) Reported-by: Len Gao Signed-off-by: Ben Pfaff Acked-by: Jarno Rajahalme Acked-by: Jesse Gross Signed-off-by: Pravin B Shelar --- net/openvswitch/datapath.c | 4 ++-- net/openvswitch/flow.c | 4 ++-- net/openvswitch/flow.h | 5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 0d407bca81e3..a863678c50ac 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -276,7 +276,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) OVS_CB(skb)->flow = flow; OVS_CB(skb)->pkt_key = &key; - ovs_flow_stats_update(OVS_CB(skb)->flow, skb); + ovs_flow_stats_update(OVS_CB(skb)->flow, key.tp.flags, skb); ovs_execute_actions(dp, skb); stats_counter = &stats->n_hit; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 334751cb1528..d07ab538fc9d 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -61,10 +61,10 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies) #define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF)) -void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb) +void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, + struct sk_buff *skb) { struct flow_stats *stats; - __be16 tcp_flags = flow->key.tp.flags; int node = numa_node_id(); stats = rcu_dereference(flow->stats[node]); diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index ac395d2cd821..5e5aaed3a85b 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -180,7 +180,8 @@ struct arp_eth_header { unsigned char ar_tip[4]; /* target IP address */ } __packed; -void ovs_flow_stats_update(struct sw_flow *, struct sk_buff *); +void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags, + struct sk_buff *); void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, unsigned long *used, __be16 *tcp_flags); void ovs_flow_stats_clear(struct sw_flow *); -- cgit v1.2.1 From 4a46b24e147dfa9b858026da02cad0bdd4e149d2 Mon Sep 17 00:00:00 2001 From: Alex Wang Date: Mon, 30 Jun 2014 20:30:29 -0700 Subject: openvswitch: Use exact lookup for flow_get and flow_del. Due to the race condition in userspace, there is chance that two overlapping megaflows could be installed in datapath. And this causes userspace unable to delete the less inclusive megaflow flow even after it timeout, since the flow_del logic will stop at the first match of masked flow. This commit fixes the bug by making the kernel flow_del and flow_get logic check all masks in that case. Introduced by 03f0d916a (openvswitch: Mega flow implementation). Signed-off-by: Alex Wang Acked-by: Andy Zhou Signed-off-by: Pravin B Shelar --- net/openvswitch/datapath.c | 23 +++++++++++------------ net/openvswitch/flow_table.c | 16 ++++++++++++++++ net/openvswitch/flow_table.h | 3 ++- 3 files changed, 29 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index a863678c50ac..9db4bf6740d1 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -889,8 +889,11 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } /* The unmasked key has to be the same for flow updates. */ if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) { - error = -EEXIST; - goto err_unlock_ovs; + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (!flow) { + error = -ENOENT; + goto err_unlock_ovs; + } } /* Update actions. */ old_acts = ovsl_dereference(flow->sf_acts); @@ -981,16 +984,12 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) goto err_unlock_ovs; } /* Check that the flow exists. */ - flow = ovs_flow_tbl_lookup(&dp->table, &key); + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (unlikely(!flow)) { error = -ENOENT; goto err_unlock_ovs; } - /* The unmasked key has to be the same for flow updates. */ - if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) { - error = -EEXIST; - goto err_unlock_ovs; - } + /* Update actions, if present. */ if (likely(acts)) { old_acts = ovsl_dereference(flow->sf_acts); @@ -1063,8 +1062,8 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) goto unlock; } - flow = ovs_flow_tbl_lookup(&dp->table, &key); - if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) { + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (!flow) { err = -ENOENT; goto unlock; } @@ -1113,8 +1112,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) goto unlock; } - flow = ovs_flow_tbl_lookup(&dp->table, &key); - if (unlikely(!flow || !ovs_flow_cmp_unmasked_key(flow, &match))) { + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (unlikely(!flow)) { err = -ENOENT; goto unlock; } diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 574c3abc9b30..cf2d853646f0 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -456,6 +456,22 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit); } +struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, + struct sw_flow_match *match) +{ + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct sw_flow_mask *mask; + struct sw_flow *flow; + + /* Always called under ovs-mutex. */ + list_for_each_entry(mask, &tbl->mask_list, list) { + flow = masked_flow_lookup(ti, match->key, mask); + if (flow && ovs_flow_cmp_unmasked_key(flow, match)) /* Found */ + return flow; + } + return NULL; +} + int ovs_flow_tbl_num_masks(const struct flow_table *table) { struct sw_flow_mask *mask; diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index ca8a5820f615..5918bff7f3f6 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -76,7 +76,8 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, u32 *n_mask_hit); struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, const struct sw_flow_key *); - +struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, + struct sw_flow_match *match); bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, struct sw_flow_match *match); -- cgit v1.2.1 From 7f502361531e9eecb396cf99bdc9e9a59f7ebd7f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jun 2014 01:26:23 -0700 Subject: ipv4: irq safe sk_dst_[re]set() and ipv4_sk_update_pmtu() fix We have two different ways to handle changes to sk->sk_dst First way (used by TCP) assumes socket lock is owned by caller, and use no extra lock : __sk_dst_set() & __sk_dst_reset() Another way (used by UDP) uses sk_dst_lock because socket lock is not always taken. Note that sk_dst_lock is not softirq safe. These ways are not inter changeable for a given socket type. ipv4_sk_update_pmtu(), added in linux-3.8, added a race, as it used the socket lock as synchronization, but users might be UDP sockets. Instead of converting sk_dst_lock to a softirq safe version, use xchg() as we did for sk_rx_dst in commit e47eb5dfb296b ("udp: ipv4: do not use sk_dst_lock from softirq context") In a follow up patch, we probably can remove sk_dst_lock, as it is only used in IPv6. Signed-off-by: Eric Dumazet Cc: Steffen Klassert Fixes: 9cb3a50c5f63e ("ipv4: Invalidate the socket cached route on pmtu events if possible") Signed-off-by: David S. Miller --- net/ipv4/route.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 082239ffe34a..3162ea923ded 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1010,7 +1010,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) const struct iphdr *iph = (const struct iphdr *) skb->data; struct flowi4 fl4; struct rtable *rt; - struct dst_entry *dst; + struct dst_entry *odst = NULL; bool new = false; bh_lock_sock(sk); @@ -1018,16 +1018,17 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) if (!ip_sk_accept_pmtu(sk)) goto out; - rt = (struct rtable *) __sk_dst_get(sk); + odst = sk_dst_get(sk); - if (sock_owned_by_user(sk) || !rt) { + if (sock_owned_by_user(sk) || !odst) { __ipv4_sk_update_pmtu(skb, sk, mtu); goto out; } __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); - if (!__sk_dst_check(sk, 0)) { + rt = (struct rtable *)odst; + if (odst->obsolete && odst->ops->check(odst, 0) == NULL) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; @@ -1037,8 +1038,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu); - dst = dst_check(&rt->dst, 0); - if (!dst) { + if (!dst_check(&rt->dst, 0)) { if (new) dst_release(&rt->dst); @@ -1050,10 +1050,11 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) } if (new) - __sk_dst_set(sk, &rt->dst); + sk_dst_set(sk, &rt->dst); out: bh_unlock_sock(sk); + dst_release(odst); } EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); -- cgit v1.2.1 From a48e5fafecfb9c0c807d7e7284b5ff884dfb7a3a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Jul 2014 02:25:15 -0700 Subject: vlan: free percpu stats in device destructor Madalin-Cristian reported crashs happening after a recent commit (5a4ae5f6e7d4 "vlan: unnecessary to check if vlan_pcpu_stats is NULL") ----------------------------------------------------------------------- root@p5040ds:~# vconfig add eth8 1 root@p5040ds:~# vconfig rem eth8.1 Unable to handle kernel paging request for data at address 0x2bc88028 Faulting instruction address: 0xc058e950 Oops: Kernel access of bad area, sig: 11 [#1] SMP NR_CPUS=8 CoreNet Generic Modules linked in: CPU: 3 PID: 2167 Comm: vconfig Tainted: G W 3.16.0-rc3-00346-g65e85bf #2 task: e7264d90 ti: e2c2c000 task.ti: e2c2c000 NIP: c058e950 LR: c058ea30 CTR: c058e900 REGS: e2c2db20 TRAP: 0300 Tainted: G W (3.16.0-rc3-00346-g65e85bf) MSR: 00029002 CR: 48000428 XER: 20000000 DEAR: 2bc88028 ESR: 00000000 GPR00: c047299c e2c2dbd0 e7264d90 00000000 2bc88000 00000000 ffffffff 00000000 GPR08: 0000000f 00000000 000000ff 00000000 28000422 10121928 10100000 10100000 GPR16: 10100000 00000000 c07c5968 00000000 00000000 00000000 e2c2dc48 e7838000 GPR24: c07c5bac c07c58a8 e77290cc c07b0000 00000000 c05de6c0 e7838000 e2c2dc48 NIP [c058e950] vlan_dev_get_stats64+0x50/0x170 LR [c058ea30] vlan_dev_get_stats64+0x130/0x170 Call Trace: [e2c2dbd0] [ffffffea] 0xffffffea (unreliable) [e2c2dc20] [c047299c] dev_get_stats+0x4c/0x140 [e2c2dc40] [c0488ca8] rtnl_fill_ifinfo+0x3d8/0x960 [e2c2dd70] [c0489f4c] rtmsg_ifinfo+0x6c/0x110 [e2c2dd90] [c04731d4] rollback_registered_many+0x344/0x3b0 [e2c2ddd0] [c047332c] rollback_registered+0x2c/0x50 [e2c2ddf0] [c0476058] unregister_netdevice_queue+0x78/0xf0 [e2c2de00] [c058d800] unregister_vlan_dev+0xc0/0x160 [e2c2de20] [c058e360] vlan_ioctl_handler+0x1c0/0x550 [e2c2de90] [c045d11c] sock_ioctl+0x28c/0x2f0 [e2c2deb0] [c010d070] do_vfs_ioctl+0x90/0x7b0 [e2c2df20] [c010d7d0] SyS_ioctl+0x40/0x80 [e2c2df40] [c000f924] ret_from_syscall+0x0/0x3c Fix this problem by freeing percpu stats from dev->destructor() instead of ndo_uninit() Reported-by: Madalin-Cristian Bucur Signed-off-by: Eric Dumazet Tested-by: Madalin-Cristian Bucur Fixes: 5a4ae5f6e7d4 ("vlan: unnecessary to check if vlan_pcpu_stats is NULL") Cc: Li RongQing Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ad2ac3c00398..dd11f612e03e 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -627,8 +627,6 @@ static void vlan_dev_uninit(struct net_device *dev) struct vlan_dev_priv *vlan = vlan_dev_priv(dev); int i; - free_percpu(vlan->vlan_pcpu_stats); - vlan->vlan_pcpu_stats = NULL; for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) { while ((pm = vlan->egress_priority_map[i]) != NULL) { vlan->egress_priority_map[i] = pm->next; @@ -785,6 +783,15 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_get_lock_subclass = vlan_dev_get_lock_subclass, }; +static void vlan_dev_free(struct net_device *dev) +{ + struct vlan_dev_priv *vlan = vlan_dev_priv(dev); + + free_percpu(vlan->vlan_pcpu_stats); + vlan->vlan_pcpu_stats = NULL; + free_netdev(dev); +} + void vlan_setup(struct net_device *dev) { ether_setup(dev); @@ -794,7 +801,7 @@ void vlan_setup(struct net_device *dev) dev->tx_queue_len = 0; dev->netdev_ops = &vlan_netdev_ops; - dev->destructor = free_netdev; + dev->destructor = vlan_dev_free; dev->ethtool_ops = &vlan_ethtool_ops; memset(dev->broadcast, 0, ETH_ALEN); -- cgit v1.2.1 From 5924f17a8a30c2ae18d034a86ee7581b34accef6 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Sat, 28 Jun 2014 18:26:37 +0200 Subject: tcp: Fix divide by zero when pushing during tcp-repair When in repair-mode and TCP_RECV_QUEUE is set, we end up calling tcp_push with mss_now being 0. If data is in the send-queue and tcp_set_skb_tso_segs gets called, we crash because it will divide by mss_now: [ 347.151939] divide error: 0000 [#1] SMP [ 347.152907] Modules linked in: [ 347.152907] CPU: 1 PID: 1123 Comm: packetdrill Not tainted 3.16.0-rc2 #4 [ 347.152907] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007 [ 347.152907] task: f5b88540 ti: f3c82000 task.ti: f3c82000 [ 347.152907] EIP: 0060:[] EFLAGS: 00210246 CPU: 1 [ 347.152907] EIP is at tcp_set_skb_tso_segs+0x49/0xa0 [ 347.152907] EAX: 00000b67 EBX: f5acd080 ECX: 00000000 EDX: 00000000 [ 347.152907] ESI: f5a28f40 EDI: f3c88f00 EBP: f3c83d10 ESP: f3c83d00 [ 347.152907] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 [ 347.152907] CR0: 80050033 CR2: 083158b0 CR3: 35146000 CR4: 000006b0 [ 347.152907] Stack: [ 347.152907] c167f9d9 f5acd080 000005b4 00000002 f3c83d20 c16013e6 f3c88f00 f5acd080 [ 347.152907] f3c83da0 c1603b5a f3c83d38 c10a0188 00000000 00000000 f3c83d84 c10acc85 [ 347.152907] c1ad5ec0 00000000 00000000 c1ad679c 010003e0 00000000 00000000 f3c88fc8 [ 347.152907] Call Trace: [ 347.152907] [] ? apic_timer_interrupt+0x2d/0x34 [ 347.152907] [] tcp_init_tso_segs+0x36/0x50 [ 347.152907] [] tcp_write_xmit+0x7a/0xbf0 [ 347.152907] [] ? up+0x28/0x40 [ 347.152907] [] ? console_unlock+0x295/0x480 [ 347.152907] [] ? vprintk_emit+0x1ef/0x4b0 [ 347.152907] [] __tcp_push_pending_frames+0x36/0xd0 [ 347.152907] [] tcp_push+0xf0/0x120 [ 347.152907] [] tcp_sendmsg+0xf1/0xbf0 [ 347.152907] [] ? kmem_cache_free+0xf0/0x120 [ 347.152907] [] ? __sigqueue_free+0x32/0x40 [ 347.152907] [] ? __sigqueue_free+0x32/0x40 [ 347.152907] [] ? do_wp_page+0x3e0/0x850 [ 347.152907] [] inet_sendmsg+0x4a/0xb0 [ 347.152907] [] ? handle_mm_fault+0x709/0xfb0 [ 347.152907] [] sock_aio_write+0xbb/0xd0 [ 347.152907] [] do_sync_write+0x69/0xa0 [ 347.152907] [] vfs_write+0x123/0x160 [ 347.152907] [] SyS_write+0x55/0xb0 [ 347.152907] [] sysenter_do_call+0x12/0x28 This can easily be reproduced with the following packetdrill-script (the "magic" with netem, sk_pacing and limit_output_bytes is done to prevent the kernel from pushing all segments, because hitting the limit without doing this is not so easy with packetdrill): 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 +0 < S 0:0(0) win 32792 +0 > S. 0:0(0) ack 1 +0.1 < . 1:1(0) ack 1 win 65000 +0 accept(3, ..., ...) = 4 // This forces that not all segments of the snd-queue will be pushed +0 `tc qdisc add dev tun0 root netem delay 10ms` +0 `sysctl -w net.ipv4.tcp_limit_output_bytes=2` +0 setsockopt(4, SOL_SOCKET, 47, [2], 4) = 0 +0 write(4,...,10000) = 10000 +0 write(4,...,10000) = 10000 // Set tcp-repair stuff, particularly TCP_RECV_QUEUE +0 setsockopt(4, SOL_TCP, 19, [1], 4) = 0 +0 setsockopt(4, SOL_TCP, 20, [1], 4) = 0 // This now will make the write push the remaining segments +0 setsockopt(4, SOL_SOCKET, 47, [20000], 4) = 0 +0 `sysctl -w net.ipv4.tcp_limit_output_bytes=130000` // Now we will crash +0 write(4,...,1000) = 1000 This happens since ec3423257508 (tcp: fix retransmission in repair mode). Prior to that, the call to tcp_push was prevented by a check for tp->repair. The patch fixes it, by adding the new goto-label out_nopush. When exiting tcp_sendmsg and a push is not required, which is the case for tp->repair, we go to this label. When repairing and calling send() with TCP_RECV_QUEUE, the data is actually put in the receive-queue. So, no push is required because no data has been added to the send-queue. Cc: Andrew Vagin Cc: Pavel Emelyanov Fixes: ec3423257508 (tcp: fix retransmission in repair mode) Signed-off-by: Christoph Paasch Acked-by: Andrew Vagin Acked-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index eb1dde37e678..9d2118e5fbc7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1108,7 +1108,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { copied = tcp_send_rcvq(sk, msg, size); - goto out; + goto out_nopush; } err = -EINVAL; @@ -1282,6 +1282,7 @@ wait_for_memory: out: if (copied) tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); +out_nopush: release_sock(sk); return copied + copied_syn; -- cgit v1.2.1 From 68b7107b62983f2cff0948292429d5f5999df096 Mon Sep 17 00:00:00 2001 From: Edward Allcutt Date: Mon, 30 Jun 2014 16:16:02 +0100 Subject: ipv4: icmp: Fix pMTU handling for rare case Some older router implementations still send Fragmentation Needed errors with the Next-Hop MTU field set to zero. This is explicitly described as an eventuality that hosts must deal with by the standard (RFC 1191) since older standards specified that those bits must be zero. Linux had a generic (for all of IPv4) implementation of the algorithm described in the RFC for searching a list of MTU plateaus for a good value. Commit 46517008e116 ("ipv4: Kill ip_rt_frag_needed().") removed this as part of the changes to remove the routing cache. Subsequently any Fragmentation Needed packet with a zero Next-Hop MTU has been discarded without being passed to the per-protocol handlers or notifying userspace for raw sockets. When there is a router which does not implement RFC 1191 on an MTU limited path then this results in stalled connections since large packets are discarded and the local protocols are not notified so they never attempt to lower the pMTU. One example I have seen is an OpenBSD router terminating IPSec tunnels. It's worth pointing out that this case is distinct from the BSD 4.2 bug which incorrectly calculated the Next-Hop MTU since the commit in question dismissed that as a valid concern. All of the per-protocols handlers implement the simple approach from RFC 1191 of immediately falling back to the minimum value. Although this is sub-optimal it is vastly preferable to connections hanging indefinitely. Remove the Next-Hop MTU != 0 check and allow such packets to follow the normal path. Fixes: 46517008e116 ("ipv4: Kill ip_rt_frag_needed().") Signed-off-by: Edward Allcutt Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 79c3d947a481..42b7bcf8045b 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -739,8 +739,6 @@ static void icmp_unreach(struct sk_buff *skb) /* fall through */ case 0: info = ntohs(icmph->un.frag.mtu); - if (!info) - goto out; } break; case ICMP_SR_FAILED: -- cgit v1.2.1 From 11ef7a8996d5d433c9cd75d80651297eccbf6d42 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 30 Jun 2014 09:50:40 -0700 Subject: net: Performance fix for process_backlog In process_backlog the input_pkt_queue is only checked once for new packets and quota is artificially reduced to reflect precisely the number of packets on the input_pkt_queue so that the loop exits appropriately. This patches changes the behavior to be more straightforward and less convoluted. Packets are processed until either the quota is met or there are no more packets to process. This patch seems to provide a small, but noticeable performance improvement. The performance improvement is a result of staying in the process_backlog loop longer which can reduce number of IPI's. Performance data using super_netperf TCP_RR with 200 flows: Before fix: 88.06% CPU utilization 125/190/309 90/95/99% latencies 1.46808e+06 tps 1145382 intrs.sec. With fix: 87.73% CPU utilization 122/183/296 90/95/99% latencies 1.4921e+06 tps 1021674.30 intrs./sec. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 30eedf677913..77c19c7bb490 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4227,9 +4227,8 @@ static int process_backlog(struct napi_struct *napi, int quota) #endif napi->weight = weight_p; local_irq_disable(); - while (work < quota) { + while (1) { struct sk_buff *skb; - unsigned int qlen; while ((skb = __skb_dequeue(&sd->process_queue))) { local_irq_enable(); @@ -4243,24 +4242,24 @@ static int process_backlog(struct napi_struct *napi, int quota) } rps_lock(sd); - qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen) - skb_queue_splice_tail_init(&sd->input_pkt_queue, - &sd->process_queue); - - if (qlen < quota - work) { + if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). * only current cpu owns and manipulates this napi, - * and NAPI_STATE_SCHED is the only possible flag set on backlog. - * we can use a plain write instead of clear_bit(), + * and NAPI_STATE_SCHED is the only possible flag set + * on backlog. + * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ list_del(&napi->poll_list); napi->state = 0; + rps_unlock(sd); - quota = work + qlen; + break; } + + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); rps_unlock(sd); } local_irq_enable(); -- cgit v1.2.1 From 54951194656e4853e441266fd095f880bc0398f3 Mon Sep 17 00:00:00 2001 From: Loic Prylli Date: Tue, 1 Jul 2014 21:39:43 -0700 Subject: net: Fix NETDEV_CHANGE notifier usage causing spurious arp flush A bug was introduced in NETDEV_CHANGE notifier sequence causing the arp table to be sometimes spuriously cleared (including manual arp entries marked permanent), upon network link carrier changes. The changed argument for the notifier was applied only to a single caller of NETDEV_CHANGE, missing among others netdev_state_change(). So upon net_carrier events induced by the network, which are triggering a call to netdev_state_change(), arp_netdev_event() would decide whether to clear or not arp cache based on random/junk stack values (a kind of read buffer overflow). Fixes: be9efd365328 ("net: pass changed flags along with NETDEV_CHANGE event") Fixes: 6c8b4e3ff81b ("arp: flush arp cache on IFF_NOARP change") Signed-off-by: Loic Prylli Signed-off-by: David S. Miller --- net/core/dev.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 77c19c7bb490..7990984ca364 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -148,6 +148,9 @@ struct list_head ptype_all __read_mostly; /* Taps */ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); +static int call_netdevice_notifiers_info(unsigned long val, + struct net_device *dev, + struct netdev_notifier_info *info); /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl @@ -1207,7 +1210,11 @@ EXPORT_SYMBOL(netdev_features_change); void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { - call_netdevice_notifiers(NETDEV_CHANGE, dev); + struct netdev_notifier_change_info change_info; + + change_info.flags_changed = 0; + call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } } -- cgit v1.2.1 From 52ad353a5344f1f700c5b777175bdfa41d3cd65a Mon Sep 17 00:00:00 2001 From: dingtianhong Date: Wed, 2 Jul 2014 13:50:48 +0800 Subject: igmp: fix the problem when mc leave group The problem was triggered by these steps: 1) create socket, bind and then setsockopt for add mc group. mreq.imr_multiaddr.s_addr = inet_addr("255.0.0.37"); mreq.imr_interface.s_addr = inet_addr("192.168.1.2"); setsockopt(sockfd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); 2) drop the mc group for this socket. mreq.imr_multiaddr.s_addr = inet_addr("255.0.0.37"); mreq.imr_interface.s_addr = inet_addr("0.0.0.0"); setsockopt(sockfd, IPPROTO_IP, IP_DROP_MEMBERSHIP, &mreq, sizeof(mreq)); 3) and then drop the socket, I found the mc group was still used by the dev: netstat -g Interface RefCnt Group --------------- ------ --------------------- eth2 1 255.0.0.37 Normally even though the IP_DROP_MEMBERSHIP return error, the mc group still need to be released for the netdev when drop the socket, but this process was broken when route default is NULL, the reason is that: The ip_mc_leave_group() will choose the in_dev by the imr_interface.s_addr, if input addr is NULL, the default route dev will be chosen, then the ifindex is got from the dev, then polling the inet->mc_list and return -ENODEV, but if the default route dev is NULL, the in_dev and ifIndex is both NULL, when polling the inet->mc_list, the mc group will be released from the mc_list, but the dev didn't dec the refcnt for this mc group, so when dropping the socket, the mc_list is NULL and the dev still keep this group. v1->v2: According Hideaki's suggestion, we should align with IPv6 (RFC3493) and BSDs, so I add the checking for the in_dev before polling the mc_list, make sure when we remove the mc group, dec the refcnt to the real dev which was using the mc address. The problem would never happened again. Signed-off-by: Ding Tianhong Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 6748d420f714..db710b059bab 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1944,6 +1944,10 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) rtnl_lock(); in_dev = ip_mc_find_dev(net, imr); + if (!in_dev) { + ret = -ENODEV; + goto out; + } ifindex = imr->imr_ifindex; for (imlp = &inet->mc_list; (iml = rtnl_dereference(*imlp)) != NULL; @@ -1961,16 +1965,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) *imlp = iml->next_rcu; - if (in_dev) - ip_mc_dec_group(in_dev, group); + ip_mc_dec_group(in_dev, group); rtnl_unlock(); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); return 0; } - if (!in_dev) - ret = -ENODEV; +out: rtnl_unlock(); return ret; } -- cgit v1.2.1 From 6e08d5e3c8236e7484229e46fdf92006e1dd4c49 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 2 Jul 2014 12:07:16 -0700 Subject: tcp: fix false undo corner cases The undo code assumes that, upon entering loss recovery, TCP 1) always retransmit something 2) the retransmission never fails locally (e.g., qdisc drop) so undo_marker is set in tcp_enter_recovery() and undo_retrans is incremented only when tcp_retransmit_skb() is successful. When the assumption is broken because TCP's cwnd is too small to retransmit or the retransmit fails locally. The next (DUP)ACK would incorrectly revert the cwnd and the congestion state in tcp_try_undo_dsack() or tcp_may_undo(). Subsequent (DUP)ACKs may enter the recovery state. The sender repeatedly enter and (incorrectly) exit recovery states if the retransmits continue to fail locally while receiving (DUP)ACKs. The fix is to initialize undo_retrans to -1 and start counting on the first retransmission. Always increment undo_retrans even if the retransmissions fail locally because they couldn't cause DSACKs to undo the cwnd reduction. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 8 ++++---- net/ipv4/tcp_output.c | 6 ++++-- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b5c23756965a..40639c288dc2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1106,7 +1106,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } /* D-SACK for already forgotten data... Do dumb counting. */ - if (dup_sack && tp->undo_marker && tp->undo_retrans && + if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) tp->undo_retrans--; @@ -1187,7 +1187,7 @@ static u8 tcp_sacktag_one(struct sock *sk, /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { - if (tp->undo_marker && tp->undo_retrans && + if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) tp->undo_retrans--; if (sacked & TCPCB_SACKED_ACKED) @@ -1893,7 +1893,7 @@ static void tcp_clear_retrans_partial(struct tcp_sock *tp) tp->lost_out = 0; tp->undo_marker = 0; - tp->undo_retrans = 0; + tp->undo_retrans = -1; } void tcp_clear_retrans(struct tcp_sock *tp) @@ -2665,7 +2665,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) tp->prior_ssthresh = 0; tp->undo_marker = tp->snd_una; - tp->undo_retrans = tp->retrans_out; + tp->undo_retrans = tp->retrans_out ? : -1; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { if (!ece_ack) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d92bce0ea24e..179b51e6bda3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2525,8 +2525,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (!tp->retrans_stamp) tp->retrans_stamp = TCP_SKB_CB(skb)->when; - tp->undo_retrans += tcp_skb_pcount(skb); - /* snd_nxt is stored to detect loss of retransmitted segment, * see tcp_input.c tcp_sacktag_write_queue(). */ @@ -2534,6 +2532,10 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) } else if (err != -EBUSY) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } + + if (tp->undo_retrans < 0) + tp->undo_retrans = 0; + tp->undo_retrans += tcp_skb_pcount(skb); return err; } -- cgit v1.2.1 From 29322d0db98e5a84f5cc6a55655bee3dc4ffb5ab Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Sat, 5 Jul 2014 13:44:13 -0400 Subject: tipc: fix bug in multicast/broadcast message reassembly Since commit 37e22164a8a3c39bdad45aa463b1e69a1fdf4110 ("tipc: rename and move message reassembly function") reassembly of long broadcast messages has been broken. This is because we test for a non-NULL return value of the *buf parameter as criteria for succesful reassembly. However, this parameter is left defined even after reception of the first fragment, when reassebly is still incomplete. This leads to a kernel crash as soon as a the first fragment of a long broadcast message is received. We fix this with this commit, by implementing a stricter behavior of the function and its return values. This commit should be applied to both net and net-next. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/msg.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 8be6e94a1ca9..0a37a472c29f 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -101,9 +101,11 @@ int tipc_msg_build(struct tipc_msg *hdr, struct iovec const *msg_sect, } /* tipc_buf_append(): Append a buffer to the fragment list of another buffer - * Let first buffer become head buffer - * Returns 1 and sets *buf to headbuf if chain is complete, otherwise 0 - * Leaves headbuf pointer at NULL if failure + * @*headbuf: in: NULL for first frag, otherwise value returned from prev call + * out: set when successful non-complete reassembly, otherwise NULL + * @*buf: in: the buffer to append. Always defined + * out: head buf after sucessful complete reassembly, otherwise NULL + * Returns 1 when reassembly complete, otherwise 0 */ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) { @@ -122,6 +124,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) goto out_free; head = *headbuf = frag; skb_frag_list_init(head); + *buf = NULL; return 0; } if (!head) @@ -150,5 +153,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) out_free: pr_warn_ratelimited("Unable to build fragment list\n"); kfree_skb(*buf); + kfree_skb(*headbuf); + *buf = *headbuf = NULL; return 0; } -- cgit v1.2.1 From e0056593b61253f1a8a9941dacda22e73b963cdc Mon Sep 17 00:00:00 2001 From: Dmitry Popov Date: Sat, 5 Jul 2014 02:26:37 +0400 Subject: ip_tunnel: fix ip_tunnel_lookup This patch fixes 3 similar bugs where incoming packets might be routed into wrong non-wildcard tunnels: 1) Consider the following setup: ip address add 1.1.1.1/24 dev eth0 ip address add 1.1.1.2/24 dev eth0 ip tunnel add ipip1 remote 2.2.2.2 local 1.1.1.1 mode ipip dev eth0 ip link set ipip1 up Incoming ipip packets from 2.2.2.2 were routed into ipip1 even if it has dst = 1.1.1.2. Moreover even if there was wildcard tunnel like ip tunnel add ipip0 remote 2.2.2.2 local any mode ipip dev eth0 but it was created before explicit one (with local 1.1.1.1), incoming ipip packets with src = 2.2.2.2 and dst = 1.1.1.2 were still routed into ipip1. Same issue existed with all tunnels that use ip_tunnel_lookup (gre, vti) 2) ip address add 1.1.1.1/24 dev eth0 ip tunnel add ipip1 remote 2.2.146.85 local 1.1.1.1 mode ipip dev eth0 ip link set ipip1 up Incoming ipip packets with dst = 1.1.1.1 were routed into ipip1, no matter what src address is. Any remote ip address which has ip_tunnel_hash = 0 raised this issue, 2.2.146.85 is just an example, there are more than 4 million of them. And again, wildcard tunnel like ip tunnel add ipip0 remote any local 1.1.1.1 mode ipip dev eth0 wouldn't be ever matched if it was created before explicit tunnel like above. Gre & vti tunnels had the same issue. 3) ip address add 1.1.1.1/24 dev eth0 ip tunnel add gre1 remote 2.2.146.84 local 1.1.1.1 key 1 mode gre dev eth0 ip link set gre1 up Any incoming gre packet with key = 1 were routed into gre1, no matter what src/dst addresses are. Any remote ip address which has ip_tunnel_hash = 0 raised the issue, 2.2.146.84 is just an example, there are more than 4 million of them. Wildcard tunnel like ip tunnel add gre2 remote any local any key 1 mode gre dev eth0 wouldn't be ever matched if it was created before explicit tunnel like above. All this stuff happened because while looking for a wildcard tunnel we didn't check that matched tunnel is a wildcard one. Fixed. Signed-off-by: Dmitry Popov Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 54b6731dab55..6f9de61dce5f 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -169,6 +169,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, hlist_for_each_entry_rcu(t, head, hash_node) { if (remote != t->parms.iph.daddr || + t->parms.iph.saddr != 0 || !(t->dev->flags & IFF_UP)) continue; @@ -185,10 +186,11 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, head = &itn->tunnels[hash]; hlist_for_each_entry_rcu(t, head, hash_node) { - if ((local != t->parms.iph.saddr && - (local != t->parms.iph.daddr || - !ipv4_is_multicast(local))) || - !(t->dev->flags & IFF_UP)) + if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) && + (local != t->parms.iph.daddr || !ipv4_is_multicast(local))) + continue; + + if (!(t->dev->flags & IFF_UP)) continue; if (!ip_tunnel_key_match(&t->parms, flags, key)) @@ -205,6 +207,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, hlist_for_each_entry_rcu(t, head, hash_node) { if (t->parms.i_key != key || + t->parms.iph.saddr != 0 || + t->parms.iph.daddr != 0 || !(t->dev->flags & IFF_UP)) continue; -- cgit v1.2.1 From 36beddc272c111689f3042bf3d10a64d8a805f93 Mon Sep 17 00:00:00 2001 From: Andrey Utkin Date: Mon, 7 Jul 2014 23:22:50 +0300 Subject: appletalk: Fix socket referencing in skb Setting just skb->sk without taking its reference and setting a destructor is invalid. However, in the places where this was done, skb is used in a way not requiring skb->sk setting. So dropping the setting of skb->sk. Thanks to Eric Dumazet for correct solution. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=79441 Reported-by: Ed Martin Signed-off-by: Andrey Utkin Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/appletalk/ddp.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 01a1082e02b3..bfcf6be1d665 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1489,8 +1489,6 @@ static int atalk_rcv(struct sk_buff *skb, struct net_device *dev, goto drop; /* Queue packet (standard) */ - skb->sk = sock; - if (sock_queue_rcv_skb(sock, skb) < 0) goto drop; @@ -1644,7 +1642,6 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr if (!skb) goto out; - skb->sk = sk; skb_reserve(skb, ddp_dl->header_length); skb_reserve(skb, dev->hard_header_len); skb->dev = dev; -- cgit v1.2.1 From ac30ef832e6af0505b6f0251a6659adcfa74975e Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Wed, 9 Jul 2014 10:31:22 -0700 Subject: netlink: Fix handling of error from netlink_dump(). netlink_dump() returns a negative errno value on error. Until now, netlink_recvmsg() directly recorded that negative value in sk->sk_err, but that's wrong since sk_err takes positive errno values. (This manifests as userspace receiving a positive return value from the recv() system call, falsely indicating success.) This bug was introduced in the commit that started checking the netlink_dump() return value, commit b44d211 (netlink: handle errors from netlink_dump()). Multithreaded Netlink dumps are one way to trigger this behavior in practice, as described in the commit message for the userspace workaround posted here: http://openvswitch.org/pipermail/dev/2014-June/042339.html This commit also fixes the same bug in netlink_poll(), introduced in commit cd1df525d (netlink: add flow control for memory mapped I/O). Signed-off-by: Ben Pfaff Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 15c731f03fa6..e6fac7e3db52 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -636,7 +636,7 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock, while (nlk->cb_running && netlink_dump_space(nlk)) { err = netlink_dump(sk); if (err < 0) { - sk->sk_err = err; + sk->sk_err = -err; sk->sk_error_report(sk); break; } @@ -2483,7 +2483,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk); if (ret) { - sk->sk_err = ret; + sk->sk_err = -ret; sk->sk_error_report(sk); } } -- cgit v1.2.1 From d0a7ebbc119738439ff00f7fadbd343ae20ea5e8 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Thu, 10 Jul 2014 17:29:21 -0700 Subject: GRE: enable offloads for GRE To get offloads to work with Generic Routing Encapsulation (GRE), the outer transport header has to be reset after skb_push is done. This patch has the support for this fix and hence GRE offloading. Signed-off-by: Amritha Nambiar Signed-off-by: Joseph Gasparakis Tested-By: Jim Young Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- net/ipv4/gre_demux.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 4e9619bca732..0485bf7f8f03 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -68,6 +68,7 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, skb_push(skb, hdr_len); + skb_reset_transport_header(skb); greh = (struct gre_base_hdr *)skb->data; greh->flags = tnl_flags_to_gre_flags(tpi->flags); greh->protocol = tpi->proto; -- cgit v1.2.1 From 999417549c16dd0e3a382aa9f6ae61688db03181 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 11 Jul 2014 08:45:27 -0400 Subject: tipc: clear 'next'-pointer of message fragments before reassembly If the 'next' pointer of the last fragment buffer in a message is not zeroed before reassembly, we risk ending up with a corrupt message, since the reassembly function itself isn't doing this. Currently, when a buffer is retrieved from the deferred queue of the broadcast link, the next pointer is not cleared, with the result as described above. This commit corrects this, and thereby fixes a bug that may occur when long broadcast messages are transmitted across dual interfaces. The bug has been present since 40ba3cdf542a469aaa9083fa041656e59b109b90 ("tipc: message reassembly using fragment chain") This commit should be applied to both net and net-next. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 26631679a1fa..55c6c9d3e1ce 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -559,6 +559,7 @@ receive: buf = node->bclink.deferred_head; node->bclink.deferred_head = buf->next; + buf->next = NULL; node->bclink.deferred_size--; goto receive; } -- cgit v1.2.1 From 8f2e5ae40ec193bc0a0ed99e95315c3eebca84ea Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 12 Jul 2014 20:30:35 +0200 Subject: net: sctp: fix information leaks in ulpevent layer While working on some other SCTP code, I noticed that some structures shared with user space are leaking uninitialized stack or heap buffer. In particular, struct sctp_sndrcvinfo has a 2 bytes hole between .sinfo_flags and .sinfo_ppid that remains unfilled by us in sctp_ulpevent_read_sndrcvinfo() when putting this into cmsg. But also struct sctp_remote_error contains a 2 bytes hole that we don't fill but place into a skb through skb_copy_expand() via sctp_ulpevent_make_remote_error(). Both structures are defined by the IETF in RFC6458: * Section 5.3.2. SCTP Header Information Structure: The sctp_sndrcvinfo structure is defined below: struct sctp_sndrcvinfo { uint16_t sinfo_stream; uint16_t sinfo_ssn; uint16_t sinfo_flags; <-- 2 bytes hole --> uint32_t sinfo_ppid; uint32_t sinfo_context; uint32_t sinfo_timetolive; uint32_t sinfo_tsn; uint32_t sinfo_cumtsn; sctp_assoc_t sinfo_assoc_id; }; * 6.1.3. SCTP_REMOTE_ERROR: A remote peer may send an Operation Error message to its peer. This message indicates a variety of error conditions on an association. The entire ERROR chunk as it appears on the wire is included in an SCTP_REMOTE_ERROR event. Please refer to the SCTP specification [RFC4960] and any extensions for a list of possible error formats. An SCTP error notification has the following format: struct sctp_remote_error { uint16_t sre_type; uint16_t sre_flags; uint32_t sre_length; uint16_t sre_error; <-- 2 bytes hole --> sctp_assoc_t sre_assoc_id; uint8_t sre_data[]; }; Fix this by setting both to 0 before filling them out. We also have other structures shared between user and kernel space in SCTP that contains holes (e.g. struct sctp_paddrthlds), but we copy that buffer over from user space first and thus don't need to care about it in that cases. While at it, we can also remove lengthy comments copied from the draft, instead, we update the comment with the correct RFC number where one can look it up. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/sctp/ulpevent.c | 122 +++++++--------------------------------------------- 1 file changed, 15 insertions(+), 107 deletions(-) (limited to 'net') diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 85c64658bd0b..b6842fdb53d4 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -366,9 +366,10 @@ fail: * specification [SCTP] and any extensions for a list of possible * error formats. */ -struct sctp_ulpevent *sctp_ulpevent_make_remote_error( - const struct sctp_association *asoc, struct sctp_chunk *chunk, - __u16 flags, gfp_t gfp) +struct sctp_ulpevent * +sctp_ulpevent_make_remote_error(const struct sctp_association *asoc, + struct sctp_chunk *chunk, __u16 flags, + gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_remote_error *sre; @@ -387,8 +388,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_remote_error( /* Copy the skb to a new skb with room for us to prepend * notification with. */ - skb = skb_copy_expand(chunk->skb, sizeof(struct sctp_remote_error), - 0, gfp); + skb = skb_copy_expand(chunk->skb, sizeof(*sre), 0, gfp); /* Pull off the rest of the cause TLV from the chunk. */ skb_pull(chunk->skb, elen); @@ -399,62 +399,21 @@ struct sctp_ulpevent *sctp_ulpevent_make_remote_error( event = sctp_skb2event(skb); sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); - sre = (struct sctp_remote_error *) - skb_push(skb, sizeof(struct sctp_remote_error)); + sre = (struct sctp_remote_error *) skb_push(skb, sizeof(*sre)); /* Trim the buffer to the right length. */ - skb_trim(skb, sizeof(struct sctp_remote_error) + elen); + skb_trim(skb, sizeof(*sre) + elen); - /* Socket Extensions for SCTP - * 5.3.1.3 SCTP_REMOTE_ERROR - * - * sre_type: - * It should be SCTP_REMOTE_ERROR. - */ + /* RFC6458, Section 6.1.3. SCTP_REMOTE_ERROR */ + memset(sre, 0, sizeof(*sre)); sre->sre_type = SCTP_REMOTE_ERROR; - - /* - * Socket Extensions for SCTP - * 5.3.1.3 SCTP_REMOTE_ERROR - * - * sre_flags: 16 bits (unsigned integer) - * Currently unused. - */ sre->sre_flags = 0; - - /* Socket Extensions for SCTP - * 5.3.1.3 SCTP_REMOTE_ERROR - * - * sre_length: sizeof (__u32) - * - * This field is the total length of the notification data, - * including the notification header. - */ sre->sre_length = skb->len; - - /* Socket Extensions for SCTP - * 5.3.1.3 SCTP_REMOTE_ERROR - * - * sre_error: 16 bits (unsigned integer) - * This value represents one of the Operational Error causes defined in - * the SCTP specification, in network byte order. - */ sre->sre_error = cause; - - /* Socket Extensions for SCTP - * 5.3.1.3 SCTP_REMOTE_ERROR - * - * sre_assoc_id: sizeof (sctp_assoc_t) - * - * The association id field, holds the identifier for the association. - * All notifications for a given association have the same association - * identifier. For TCP style socket, this field is ignored. - */ sctp_ulpevent_set_owner(event, asoc); sre->sre_assoc_id = sctp_assoc2id(asoc); return event; - fail: return NULL; } @@ -899,7 +858,9 @@ __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event) return notification->sn_header.sn_type; } -/* Copy out the sndrcvinfo into a msghdr. */ +/* RFC6458, Section 5.3.2. SCTP Header Information Structure + * (SCTP_SNDRCV, DEPRECATED) + */ void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, struct msghdr *msghdr) { @@ -908,74 +869,21 @@ void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, if (sctp_ulpevent_is_notification(event)) return; - /* Sockets API Extensions for SCTP - * Section 5.2.2 SCTP Header Information Structure (SCTP_SNDRCV) - * - * sinfo_stream: 16 bits (unsigned integer) - * - * For recvmsg() the SCTP stack places the message's stream number in - * this value. - */ + memset(&sinfo, 0, sizeof(sinfo)); sinfo.sinfo_stream = event->stream; - /* sinfo_ssn: 16 bits (unsigned integer) - * - * For recvmsg() this value contains the stream sequence number that - * the remote endpoint placed in the DATA chunk. For fragmented - * messages this is the same number for all deliveries of the message - * (if more than one recvmsg() is needed to read the message). - */ sinfo.sinfo_ssn = event->ssn; - /* sinfo_ppid: 32 bits (unsigned integer) - * - * In recvmsg() this value is - * the same information that was passed by the upper layer in the peer - * application. Please note that byte order issues are NOT accounted - * for and this information is passed opaquely by the SCTP stack from - * one end to the other. - */ sinfo.sinfo_ppid = event->ppid; - /* sinfo_flags: 16 bits (unsigned integer) - * - * This field may contain any of the following flags and is composed of - * a bitwise OR of these values. - * - * recvmsg() flags: - * - * SCTP_UNORDERED - This flag is present when the message was sent - * non-ordered. - */ sinfo.sinfo_flags = event->flags; - /* sinfo_tsn: 32 bit (unsigned integer) - * - * For the receiving side, this field holds a TSN that was - * assigned to one of the SCTP Data Chunks. - */ sinfo.sinfo_tsn = event->tsn; - /* sinfo_cumtsn: 32 bit (unsigned integer) - * - * This field will hold the current cumulative TSN as - * known by the underlying SCTP layer. Note this field is - * ignored when sending and only valid for a receive - * operation when sinfo_flags are set to SCTP_UNORDERED. - */ sinfo.sinfo_cumtsn = event->cumtsn; - /* sinfo_assoc_id: sizeof (sctp_assoc_t) - * - * The association handle field, sinfo_assoc_id, holds the identifier - * for the association announced in the COMMUNICATION_UP notification. - * All notifications for a given association have the same identifier. - * Ignored for one-to-one style sockets. - */ sinfo.sinfo_assoc_id = sctp_assoc2id(event->asoc); - - /* context value that is set via SCTP_CONTEXT socket option. */ + /* Context value that is set via SCTP_CONTEXT socket option. */ sinfo.sinfo_context = event->asoc->default_rcv_context; - /* These fields are not used while receiving. */ sinfo.sinfo_timetolive = 0; put_cmsg(msghdr, IPPROTO_SCTP, SCTP_SNDRCV, - sizeof(struct sctp_sndrcvinfo), (void *)&sinfo); + sizeof(sinfo), &sinfo); } /* Do accounting for bytes received and hold a reference to the association -- cgit v1.2.1 From 9ecf07a1d8f70f72ec99a0f102c8aa24609d84f4 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sat, 12 Jul 2014 22:36:44 +0200 Subject: neigh: sysctl - simplify address calculation of gc_* variables The code in neigh_sysctl_register() relies on a specific layout of struct neigh_table, namely that the 'gc_*' variables are directly following the 'parms' member in a specific order. The code, though, expresses this in the most ugly way. Get rid of the ugly casts and use the 'tbl' pointer to get a handle to the table. This way we can refer to the 'gc_*' variables directly. Similarly seen in the grsecurity patch, written by Brad Spengler. Signed-off-by: Mathias Krause Cc: Brad Spengler Signed-off-by: David S. Miller --- net/core/neighbour.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 32d872eec7f5..559890b0f0a2 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3059,11 +3059,12 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); } else { + struct neigh_table *tbl = p->tbl; dev_name_source = "default"; - t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1); - t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1; - t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2; - t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3; + t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval; + t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1; + t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2; + t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3; } if (handler) { -- cgit v1.2.1 From 3cf521f7dc87c031617fd47e4b7aa2593c2f3daf Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 14 Jul 2014 17:02:31 -0700 Subject: net/l2tp: don't fall back on UDP [get|set]sockopt The l2tp [get|set]sockopt() code has fallen back to the UDP functions for socket option levels != SOL_PPPOL2TP since day one, but that has never actually worked, since the l2tp socket isn't an inet socket. As David Miller points out: "If we wanted this to work, it'd have to look up the tunnel and then use tunnel->sk, but I wonder how useful that would be" Since this can never have worked so nobody could possibly have depended on that functionality, just remove the broken code and return -EINVAL. Reported-by: Sasha Levin Acked-by: James Chapman Acked-by: David Miller Cc: Phil Turnbull Cc: Vegard Nossum Cc: Willy Tarreau Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- net/l2tp/l2tp_ppp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 950909f04ee6..13752d96275e 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1365,7 +1365,7 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, int err; if (level != SOL_PPPOL2TP) - return udp_prot.setsockopt(sk, level, optname, optval, optlen); + return -EINVAL; if (optlen < sizeof(int)) return -EINVAL; @@ -1491,7 +1491,7 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname, struct pppol2tp_session *ps; if (level != SOL_PPPOL2TP) - return udp_prot.getsockopt(sk, level, optname, optval, optlen); + return -EINVAL; if (get_user(len, optlen)) return -EFAULT; -- cgit v1.2.1 From bf858ab0adcb36fc60a1d941749a7dd0a9bf3d8a Mon Sep 17 00:00:00 2001 From: Yan Burman Date: Thu, 19 Jun 2014 16:06:30 +0300 Subject: xprtrdma: Fix DMA-API-DEBUG warning by checking dma_map result Fix the following warning when DMA-API debug is enabled by checking ib_dma_map_single result: [ 1455.345548] ------------[ cut here ]------------ [ 1455.346863] WARNING: CPU: 3 PID: 3929 at /home/yanb/kernel/net-next/lib/dma-debug.c:1140 check_unmap+0x4e5/0x990() [ 1455.349350] mlx4_core 0000:00:07.0: DMA-API: device driver failed to check map error[device address=0x000000007c9f2090] [size=2656 bytes] [mapped as single] [ 1455.349350] Modules linked in: xprtrdma netconsole configfs nfsv3 nfs_acl ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm autofs4 auth_rpcgss oid_registry nfsv4 nfs fscache lockd sunrpc dm_mirror dm_region_hash dm_log microcode pcspkr mlx4_ib ib_sa ib_mad ib_core ib_addr mlx4_en ipv6 ptp pps_core vxlan mlx4_core virtio_balloon cirrus ttm drm_kms_helper drm sysimgblt sysfillrect syscopyarea i2c_piix4 i2c_core button ext3 jbd virtio_blk virtio_net virtio_pci virtio_ring virtio uhci_hcd ata_generic ata_piix libata [ 1455.349350] CPU: 3 PID: 3929 Comm: mount.nfs Not tainted 3.15.0-rc1-dbg+ #13 [ 1455.349350] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2007 [ 1455.349350] 0000000000000474 ffff880069dcf628 ffffffff8151c341 ffffffff817b69d8 [ 1455.349350] ffff880069dcf678 ffff880069dcf668 ffffffff8105b5fc 0000000069dcf658 [ 1455.349350] ffff880069dcf778 ffff88007b0c9f00 ffffffff8255ec40 0000000000000a60 [ 1455.349350] Call Trace: [ 1455.349350] [] dump_stack+0x52/0x81 [ 1455.349350] [] warn_slowpath_common+0x8c/0xc0 [ 1455.349350] [] warn_slowpath_fmt+0x46/0x50 [ 1455.349350] [] check_unmap+0x4e5/0x990 [ 1455.349350] [] ? _raw_spin_unlock_irq+0x30/0x60 [ 1455.349350] [] debug_dma_unmap_page+0x5a/0x60 [ 1455.349350] [] rpcrdma_deregister_internal+0xb3/0xd0 [xprtrdma] [ 1455.349350] [] rpcrdma_buffer_destroy+0x69/0x170 [xprtrdma] [ 1455.349350] [] xprt_rdma_destroy+0x3f/0xb0 [xprtrdma] [ 1455.349350] [] xprt_destroy+0x6f/0x80 [sunrpc] [ 1455.349350] [] xprt_put+0x15/0x20 [sunrpc] [ 1455.349350] [] rpc_free_client+0x8a/0xe0 [sunrpc] [ 1455.349350] [] rpc_release_client+0x68/0xa0 [sunrpc] [ 1455.349350] [] rpc_shutdown_client+0xb0/0xc0 [sunrpc] [ 1455.349350] [] ? rpc_ping+0x5d/0x70 [sunrpc] [ 1455.349350] [] rpc_create_xprt+0xbb/0xd0 [sunrpc] [ 1455.349350] [] rpc_create+0xb3/0x160 [sunrpc] [ 1455.349350] [] ? __probe_kernel_read+0x69/0xb0 [ 1455.349350] [] nfs_create_rpc_client+0xdc/0x100 [nfs] [ 1455.349350] [] nfs_init_client+0x3a/0x90 [nfs] [ 1455.349350] [] nfs_get_client+0x478/0x5b0 [nfs] [ 1455.349350] [] ? nfs_get_client+0x100/0x5b0 [nfs] [ 1455.349350] [] ? kmem_cache_alloc_trace+0x24d/0x260 [ 1455.349350] [] nfs_create_server+0xf3/0x4c0 [nfs] [ 1455.349350] [] ? nfs_request_mount+0xf0/0x1a0 [nfs] [ 1455.349350] [] nfs3_create_server+0x13/0x30 [nfsv3] [ 1455.349350] [] nfs_try_mount+0x1f3/0x230 [nfs] [ 1455.349350] [] ? get_parent_ip+0x11/0x50 [ 1455.349350] [] ? __this_cpu_preempt_check+0x13/0x20 [ 1455.349350] [] ? try_module_get+0x6b/0x190 [ 1455.349350] [] nfs_fs_mount+0x187/0x9d0 [nfs] [ 1455.349350] [] ? nfs_clone_super+0x140/0x140 [nfs] [ 1455.349350] [] ? nfs_auth_info_match+0x40/0x40 [nfs] [ 1455.349350] [] mount_fs+0x20/0xe0 [ 1455.349350] [] vfs_kern_mount+0x76/0x160 [ 1455.349350] [] do_mount+0x428/0xae0 [ 1455.349350] [] SyS_mount+0x90/0xe0 [ 1455.349350] [] system_call_fastpath+0x16/0x1b [ 1455.349350] ---[ end trace f1f31572972e211d ]--- Signed-off-by: Yan Burman Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 13dbd1c389ff..176dafc6e6d7 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1388,6 +1388,9 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, */ iov->addr = ib_dma_map_single(ia->ri_id->device, va, len, DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(ia->ri_id->device, iov->addr)) + return -ENOMEM; + iov->length = len; if (ia->ri_have_dma_lkey) { -- cgit v1.2.1 From 5fc83f470d8ada25927701512cf94a53dab6c4c8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:23:17 -0400 Subject: xprtrdma: Fix panic in rpcrdma_register_frmr_external() seg1->mr_nsegs is not yet initialized when it is used to unmap segments during an error exit. Use the same unmapping logic for all error exits. "if (frmr_wr.wr.fast_reg.length < len) {" used to be a BUG_ON check. The broken code will never be executed under normal operation. Fixes: c977dea (xprtrdma: Remove BUG_ON() call sites) Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 176dafc6e6d7..f337bdaa9939 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1548,9 +1548,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; if (frmr_wr.wr.fast_reg.length < len) { - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - return -EIO; + rc = -EIO; + goto out_err; } /* Bump the key */ @@ -1568,8 +1567,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, if (rc) { dprintk("RPC: %s: failed ib_post_send for register," " status %i\n", __func__, rc); - while (i--) - rpcrdma_unmap_one(ia, --seg); + goto out_err; } else { seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; seg1->mr_base = seg1->mr_dma + pageoff; @@ -1577,6 +1575,10 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, seg1->mr_len = len; } *nsegs = i; + return 0; +out_err: + while (i--) + rpcrdma_unmap_one(ia, --seg); return rc; } -- cgit v1.2.1 From 73806c8832b3438ef0439603dab1f3cfc61cb6cd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:23:25 -0400 Subject: xprtrdma: Protect ia->ri_id when unmapping/invalidating MRs Ensure ia->ri_id remains valid while invoking dma_unmap_page() or posting LOCAL_INV during a transport reconnect. Otherwise, ia->ri_id->device or ia->ri_id->qp is NULL, which triggers a panic. BugLink: https://bugzilla.linux-nfs.org/show_bug.cgi?id=259 Fixes: ec62f40 'xprtrdma: Ensure ia->ri_id->qp is not NULL when reconnecting' Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 23 +++++++++++++++++------ net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index f337bdaa9939..aa08de89de42 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -613,6 +613,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) /* Else will do memory reg/dereg for each chunk */ ia->ri_memreg_strategy = memreg; + rwlock_init(&ia->ri_qplock); return 0; out2: rdma_destroy_id(ia->ri_id); @@ -859,7 +860,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) int rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - struct rdma_cm_id *id; + struct rdma_cm_id *id, *old; int rc = 0; int retry_count = 0; @@ -905,9 +906,14 @@ retry: rc = -ENETUNREACH; goto out; } - rdma_destroy_qp(ia->ri_id); - rdma_destroy_id(ia->ri_id); + + write_lock(&ia->ri_qplock); + old = ia->ri_id; ia->ri_id = id; + write_unlock(&ia->ri_qplock); + + rdma_destroy_qp(old); + rdma_destroy_id(old); } else { dprintk("RPC: %s: connecting...\n", __func__); rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); @@ -1590,9 +1596,6 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, struct ib_send_wr invalidate_wr, *bad_wr; int rc; - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - memset(&invalidate_wr, 0, sizeof invalidate_wr); invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; invalidate_wr.opcode = IB_WR_LOCAL_INV; @@ -1600,7 +1603,11 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); + read_lock(&ia->ri_qplock); + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + read_unlock(&ia->ri_qplock); if (rc) dprintk("RPC: %s: failed ib_post_send for invalidate," " status %i\n", __func__, rc); @@ -1661,8 +1668,10 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l); rc = ib_unmap_fmr(&l); + read_lock(&ia->ri_qplock); while (seg1->mr_nsegs--) rpcrdma_unmap_one(ia, seg++); + read_unlock(&ia->ri_qplock); if (rc) dprintk("RPC: %s: failed ib_unmap_fmr," " status %i\n", __func__, rc); @@ -1718,7 +1727,9 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, #if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL: + read_lock(&ia->ri_qplock); rpcrdma_unmap_one(ia, seg); + read_unlock(&ia->ri_qplock); break; #endif diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 89e7cd479705..97ca516ec619 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -59,6 +59,7 @@ * Interface Adapter -- one per transport instance */ struct rpcrdma_ia { + rwlock_t ri_qplock; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; struct ib_mr *ri_bind_mem; -- cgit v1.2.1 From 43e95988178ed70a878a5be6be9ad248342dbf7d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:23:34 -0400 Subject: xprtrdma: Limit data payload size for ALLPHYSICAL When the client uses physical memory registration, each page in the payload gets its own array entry in the RPC/RDMA header's chunk list. Therefore, don't advertise a maximum payload size that would require more array entries than can fit in the RPC buffer where RPC/RDMA headers are built. BugLink: https://bugzilla.linux-nfs.org/show_bug.cgi?id=248 Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 4 +++- net/sunrpc/xprtrdma/verbs.c | 41 +++++++++++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 3 files changed, 45 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 66f91f0d071a..418510202919 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -296,7 +296,6 @@ xprt_setup_rdma(struct xprt_create *args) xprt->resvport = 0; /* privileged port not needed */ xprt->tsh_size = 0; /* RPC-RDMA handles framing */ - xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE; xprt->ops = &xprt_rdma_procs; /* @@ -382,6 +381,9 @@ xprt_setup_rdma(struct xprt_create *args) new_ep->rep_xprt = xprt; xprt_rdma_format_addresses(xprt); + xprt->max_payload = rpcrdma_max_payload(new_xprt); + dprintk("RPC: %s: transport data payload maximum: %zu bytes\n", + __func__, xprt->max_payload); if (!try_module_get(THIS_MODULE)) goto out4; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index aa08de89de42..13ff87400203 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1825,3 +1825,44 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, rc); return rc; } + +/* Physical mapping means one Read/Write list entry per-page. + * All list entries must fit within an inline buffer + * + * NB: The server must return a Write list for NFS READ, + * which has the same constraint. Factor in the inline + * rsize as well. + */ +static size_t +rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + unsigned int inline_size, pages; + + inline_size = min_t(unsigned int, + cdata->inline_wsize, cdata->inline_rsize); + inline_size -= RPCRDMA_HDRLEN_MIN; + pages = inline_size / sizeof(struct rpcrdma_segment); + return pages << PAGE_SHIFT; +} + +static size_t +rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt) +{ + return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT; +} + +size_t +rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt) +{ + size_t result; + + switch (r_xprt->rx_ia.ri_memreg_strategy) { + case RPCRDMA_ALLPHYSICAL: + result = rpcrdma_physical_max_payload(r_xprt); + break; + default: + result = rpcrdma_mr_max_payload(r_xprt); + } + return result; +} diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 97ca516ec619..f3d86b24a4af 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -348,6 +348,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *); * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ int rpcrdma_marshal_req(struct rpc_rqst *); +size_t rpcrdma_max_payload(struct rpcrdma_xprt *); /* Temporary NFS request map cache. Created in svc_rdma.c */ extern struct kmem_cache *svc_rdma_map_cachep; -- cgit v1.2.1 From 6ab59945f292a5c6cbc4a6c2011f1a732a116af2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:23:43 -0400 Subject: xprtrdma: Update rkeys after transport reconnect Various reports of: rpcrdma_qp_async_error_upcall: QP error 3 on device mlx4_0 ep ffff8800bfd3e848 Ensure that rkeys in already-marshalled RPC/RDMA headers are refreshed after the QP has been replaced by a reconnect. BugLink: https://bugzilla.linux-nfs.org/show_bug.cgi?id=249 Suggested-by: Selvin Xavier Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 75 +++++++++++++++++++++-------------------- net/sunrpc/xprtrdma/transport.c | 11 +++--- net/sunrpc/xprtrdma/xprt_rdma.h | 10 ++++++ 3 files changed, 54 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 693966d3f33b..54422f73b03b 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -53,14 +53,6 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -enum rpcrdma_chunktype { - rpcrdma_noch = 0, - rpcrdma_readch, - rpcrdma_areadch, - rpcrdma_writech, - rpcrdma_replych -}; - #ifdef RPC_DEBUG static const char transfertypes[][12] = { "pure inline", /* no chunks */ @@ -285,6 +277,28 @@ out: return n; } +/* + * Marshal chunks. This routine returns the header length + * consumed by marshaling. + * + * Returns positive RPC/RDMA header size, or negative errno. + */ + +ssize_t +rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result) +{ + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base; + + if (req->rl_rtype != rpcrdma_noch) + result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, + headerp, req->rl_rtype); + else if (req->rl_wtype != rpcrdma_noch) + result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, + headerp, req->rl_wtype); + return result; +} + /* * Copy write data inline. * This function is used for "small" requests. Data which is passed @@ -377,7 +391,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) char *base; size_t rpclen, padlen; ssize_t hdrlen; - enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; /* @@ -415,13 +428,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * into pages; otherwise use reply chunks. */ if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) - wtype = rpcrdma_noch; + req->rl_wtype = rpcrdma_noch; else if (rqst->rq_rcv_buf.page_len == 0) - wtype = rpcrdma_replych; + req->rl_wtype = rpcrdma_replych; else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) - wtype = rpcrdma_writech; + req->rl_wtype = rpcrdma_writech; else - wtype = rpcrdma_replych; + req->rl_wtype = rpcrdma_replych; /* * Chunks needed for arguments? @@ -438,16 +451,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * TBD check NFSv4 setacl */ if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) - rtype = rpcrdma_noch; + req->rl_rtype = rpcrdma_noch; else if (rqst->rq_snd_buf.page_len == 0) - rtype = rpcrdma_areadch; + req->rl_rtype = rpcrdma_areadch; else - rtype = rpcrdma_readch; + req->rl_rtype = rpcrdma_readch; /* The following simplification is not true forever */ - if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) - wtype = rpcrdma_noch; - if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { + if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych) + req->rl_wtype = rpcrdma_noch; + if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) { dprintk("RPC: %s: cannot marshal multiple chunk lists\n", __func__); return -EIO; @@ -461,7 +474,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * When padding is in use and applies to the transfer, insert * it and change the message type. */ - if (rtype == rpcrdma_noch) { + if (req->rl_rtype == rpcrdma_noch) { padlen = rpcrdma_inline_pullup(rqst, RPCRDMA_INLINE_PAD_VALUE(rqst)); @@ -476,7 +489,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ - if (wtype != rpcrdma_noch) { + if (req->rl_wtype != rpcrdma_noch) { dprintk("RPC: %s: invalid chunk list\n", __func__); return -EIO; @@ -497,30 +510,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * on receive. Therefore, we request a reply chunk * for non-writes wherever feasible and efficient. */ - if (wtype == rpcrdma_noch) - wtype = rpcrdma_replych; + if (req->rl_wtype == rpcrdma_noch) + req->rl_wtype = rpcrdma_replych; } } - /* - * Marshal chunks. This routine will return the header length - * consumed by marshaling. - */ - if (rtype != rpcrdma_noch) { - hdrlen = rpcrdma_create_chunks(rqst, - &rqst->rq_snd_buf, headerp, rtype); - wtype = rtype; /* simplify dprintk */ - - } else if (wtype != rpcrdma_noch) { - hdrlen = rpcrdma_create_chunks(rqst, - &rqst->rq_rcv_buf, headerp, wtype); - } + hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen); if (hdrlen < 0) return hdrlen; dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", - __func__, transfertypes[wtype], hdrlen, rpclen, padlen, + __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen, headerp, base, req->rl_iov.lkey); /* diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 418510202919..f6d280b31dc9 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -597,13 +597,14 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - int rc; + int rc = 0; - if (req->rl_niovs == 0) { + if (req->rl_niovs == 0) rc = rpcrdma_marshal_req(rqst); - if (rc < 0) - goto failed_marshal; - } + else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) + rc = rpcrdma_marshal_chunks(rqst, 0); + if (rc < 0) + goto failed_marshal; if (req->rl_reply == NULL) /* e.g. reconnection */ rpcrdma_recv_buffer_get(req); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index f3d86b24a4af..c270e59cf917 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -99,6 +99,14 @@ struct rpcrdma_ep { #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) +enum rpcrdma_chunktype { + rpcrdma_noch = 0, + rpcrdma_readch, + rpcrdma_areadch, + rpcrdma_writech, + rpcrdma_replych +}; + /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. It needs several pieces of @@ -192,6 +200,7 @@ struct rpcrdma_req { unsigned int rl_niovs; /* 0, 2 or 4 */ unsigned int rl_nchunks; /* non-zero if chunks */ unsigned int rl_connect_cookie; /* retry detection */ + enum rpcrdma_chunktype rl_rtype, rl_wtype; struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ @@ -347,6 +356,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *); /* * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ +ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t); int rpcrdma_marshal_req(struct rpc_rqst *); size_t rpcrdma_max_payload(struct rpcrdma_xprt *); -- cgit v1.2.1 From a7bc211ac926172ad20463afcf00ae7b9ebcd950 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:23:52 -0400 Subject: xprtrdma: On disconnect, don't ignore pending CQEs xprtrdma is currently throwing away queued completions during a reconnect. RPC replies posted just before connection loss, or successful completions that change the state of an FRMR, can be missed. Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 13ff87400203..e49cdc930dfd 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -310,6 +310,13 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) rpcrdma_recvcq_poll(cq, ep); } +static void +rpcrdma_flush_cqs(struct rpcrdma_ep *ep) +{ + rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep); + rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep); +} + #ifdef RPC_DEBUG static const char * const conn[] = { "address resolved", @@ -872,9 +879,7 @@ retry: if (rc && rc != -ENOTCONN) dprintk("RPC: %s: rpcrdma_ep_disconnect" " status %i\n", __func__, rc); - - rpcrdma_clean_cq(ep->rep_attr.recv_cq); - rpcrdma_clean_cq(ep->rep_attr.send_cq); + rpcrdma_flush_cqs(ep); xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); id = rpcrdma_create_id(xprt, ia, @@ -985,8 +990,7 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { int rc; - rpcrdma_clean_cq(ep->rep_attr.recv_cq); - rpcrdma_clean_cq(ep->rep_attr.send_cq); + rpcrdma_flush_cqs(ep); rc = rdma_disconnect(ia->ri_id); if (!rc) { /* returns without wait if not connected */ -- cgit v1.2.1 From 539431a437d2e5d6d94016184dfc0aab263c01e1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:24:01 -0400 Subject: xprtrdma: Don't invalidate FRMRs if registration fails If FRMR registration fails, it's likely to transition the QP to the error state. Or, registration may have failed because the QP is _already_ in ERROR. Thus calling rpcrdma_deregister_external() in rpcrdma_create_chunks() is useless in FRMR mode: the LOCAL_INVs just get flushed. It is safe to leave existing registrations: when FRMR registration is tried again, rpcrdma_register_frmr_external() checks if each FRMR is already/still VALID, and knocks it down first if it is. Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 54422f73b03b..6166c985fe24 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -271,9 +271,11 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, return (unsigned char *)iptr - (unsigned char *)headerp; out: - for (pos = 0; nchunks--;) - pos += rpcrdma_deregister_external( - &req->rl_segments[pos], r_xprt); + if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) { + for (pos = 0; nchunks--;) + pos += rpcrdma_deregister_external( + &req->rl_segments[pos], r_xprt); + } return n; } -- cgit v1.2.1 From 0dbb4108a6a615589751de2aaf468d3ddbcef24c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:24:09 -0400 Subject: xprtrdma: Unclutter struct rpcrdma_mr_seg Clean ups: - make it obvious that the rl_mw field is a pointer -- allocated separately, not as part of struct rpcrdma_mr_seg - promote "struct {} frmr;" to a named type - promote the state enum to a named type - name the MW state field the same way other fields in rpcrdma_mw are named Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 34 +++++++++++++++---------------- net/sunrpc/xprtrdma/xprt_rdma.h | 44 ++++++++++++++++++++++++++++++----------- 2 files changed, 50 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e49cdc930dfd..dd1dabcd3a07 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -156,9 +156,9 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc) return; if (wc->opcode == IB_WC_FAST_REG_MR) - frmr->r.frmr.state = FRMR_IS_VALID; + frmr->r.frmr.fr_state = FRMR_IS_VALID; else if (wc->opcode == IB_WC_LOCAL_INV) - frmr->r.frmr.state = FRMR_IS_INVALID; + frmr->r.frmr.fr_state = FRMR_IS_INVALID; } static int @@ -1496,6 +1496,9 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, struct rpcrdma_xprt *r_xprt) { struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw; + struct rpcrdma_frmr *frmr = &mw->r.frmr; + struct ib_mr *mr = frmr->fr_mr; struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr; u8 key; @@ -1515,8 +1518,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, rpcrdma_map_one(ia, seg, writing); pa = seg->mr_dma; for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { - seg1->mr_chunk.rl_mw->r.frmr.fr_pgl-> - page_list[page_no++] = pa; + frmr->fr_pgl->page_list[page_no++] = pa; pa += PAGE_SIZE; } len += seg->mr_len; @@ -1528,20 +1530,18 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, break; } dprintk("RPC: %s: Using frmr %p to map %d segments\n", - __func__, seg1->mr_chunk.rl_mw, i); + __func__, mw, i); - if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) { + if (unlikely(frmr->fr_state == FRMR_IS_VALID)) { dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n", - __func__, - seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey); + __func__, mr->rkey); /* Invalidate before using. */ memset(&invalidate_wr, 0, sizeof invalidate_wr); - invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; + invalidate_wr.wr_id = (unsigned long)(void *)mw; invalidate_wr.next = &frmr_wr; invalidate_wr.opcode = IB_WR_LOCAL_INV; invalidate_wr.send_flags = IB_SEND_SIGNALED; - invalidate_wr.ex.invalidate_rkey = - seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + invalidate_wr.ex.invalidate_rkey = mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); post_wr = &invalidate_wr; } else @@ -1549,11 +1549,11 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, /* Prepare FRMR WR */ memset(&frmr_wr, 0, sizeof frmr_wr); - frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; + frmr_wr.wr_id = (unsigned long)(void *)mw; frmr_wr.opcode = IB_WR_FAST_REG_MR; frmr_wr.send_flags = IB_SEND_SIGNALED; frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma; - frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; + frmr_wr.wr.fast_reg.page_list = frmr->fr_pgl; frmr_wr.wr.fast_reg.page_list_len = page_no; frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; @@ -1563,13 +1563,13 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, } /* Bump the key */ - key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); - ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); frmr_wr.wr.fast_reg.access_flags = (writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ); - frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + frmr_wr.wr.fast_reg.rkey = mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); @@ -1579,7 +1579,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, " status %i\n", __func__, rc); goto out_err; } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + seg1->mr_rkey = mr->rkey; seg1->mr_base = seg1->mr_dma + pageoff; seg1->mr_nsegs = i; seg1->mr_len = len; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index c270e59cf917..84c3455a9521 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -145,6 +145,38 @@ struct rpcrdma_rep { char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */ }; +/* + * struct rpcrdma_mw - external memory region metadata + * + * An external memory region is any buffer or page that is registered + * on the fly (ie, not pre-registered). + * + * Each rpcrdma_buffer has a list of these anchored in rb_mws. During + * call_allocate, rpcrdma_buffer_get() assigns one to each segment in + * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep + * track of registration metadata while each RPC is pending. + * rpcrdma_deregister_external() uses this metadata to unmap and + * release these resources when an RPC is complete. + */ +enum rpcrdma_frmr_state { + FRMR_IS_INVALID, /* ready to be used */ + FRMR_IS_VALID, /* in use */ +}; + +struct rpcrdma_frmr { + struct ib_fast_reg_page_list *fr_pgl; + struct ib_mr *fr_mr; + enum rpcrdma_frmr_state fr_state; +}; + +struct rpcrdma_mw { + union { + struct ib_fmr *fmr; + struct rpcrdma_frmr frmr; + } r; + struct list_head mw_list; +}; + /* * struct rpcrdma_req -- structure central to the request/reply sequence. * @@ -172,17 +204,7 @@ struct rpcrdma_rep { struct rpcrdma_mr_seg { /* chunk descriptors */ union { /* chunk memory handles */ struct ib_mr *rl_mr; /* if registered directly */ - struct rpcrdma_mw { /* if registered from region */ - union { - struct ib_fmr *fmr; - struct { - struct ib_fast_reg_page_list *fr_pgl; - struct ib_mr *fr_mr; - enum { FRMR_IS_INVALID, FRMR_IS_VALID } state; - } frmr; - } r; - struct list_head mw_list; - } *rl_mw; + struct rpcrdma_mw *rl_mw; /* if registered from region */ } mr_chunk; u64 mr_base; /* registration result */ u32 mr_rkey; /* registration result */ -- cgit v1.2.1 From c93e986a295d537589efd0504f36ca952bd1a5be Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:24:19 -0400 Subject: xprtrdma: Back off rkey when FAST_REG_MR fails If posting a FAST_REG_MR Work Reqeust fails, revert the rkey update to avoid subsequent IB_WC_MW_BIND_ERR completions. Suggested-by: Steve Wise Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index dd1dabcd3a07..b670f4d92840 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1577,6 +1577,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, if (rc) { dprintk("RPC: %s: failed ib_post_send for register," " status %i\n", __func__, rc); + ib_update_fast_reg_key(mr, --key); goto out_err; } else { seg1->mr_rkey = mr->rkey; -- cgit v1.2.1 From 3111d72c7ced444b1034f6e365e0e02444c68aa8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:24:28 -0400 Subject: xprtrdma: Chain together all MWs in same buffer pool During connection loss recovery, need to visit every MW in a buffer pool. Any MW that is in use by an RPC will not be on the rb_mws list. Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 4 ++++ net/sunrpc/xprtrdma/xprt_rdma.h | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index b670f4d92840..0ad7d10f13a7 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1074,6 +1074,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, p += cdata->padding; INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_all); r = (struct rpcrdma_mw *)p; switch (ia->ri_memreg_strategy) { case RPCRDMA_FRMR: @@ -1098,6 +1099,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, ib_dereg_mr(r->r.frmr.fr_mr); goto out; } + list_add(&r->mw_all, &buf->rb_all); list_add(&r->mw_list, &buf->rb_mws); ++r; } @@ -1116,6 +1118,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, " failed %i\n", __func__, rc); goto out; } + list_add(&r->mw_all, &buf->rb_all); list_add(&r->mw_list, &buf->rb_mws); ++r; } @@ -1225,6 +1228,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) while (!list_empty(&buf->rb_mws)) { r = list_entry(buf->rb_mws.next, struct rpcrdma_mw, mw_list); + list_del(&r->mw_all); list_del(&r->mw_list); switch (ia->ri_memreg_strategy) { case RPCRDMA_FRMR: diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 84c3455a9521..c1d865287b0e 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -151,7 +151,7 @@ struct rpcrdma_rep { * An external memory region is any buffer or page that is registered * on the fly (ie, not pre-registered). * - * Each rpcrdma_buffer has a list of these anchored in rb_mws. During + * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During * call_allocate, rpcrdma_buffer_get() assigns one to each segment in * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep * track of registration metadata while each RPC is pending. @@ -175,6 +175,7 @@ struct rpcrdma_mw { struct rpcrdma_frmr frmr; } r; struct list_head mw_list; + struct list_head mw_all; }; /* @@ -246,6 +247,7 @@ struct rpcrdma_buffer { atomic_t rb_credits; /* most recent server credits */ int rb_max_requests;/* client max requests */ struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ + struct list_head rb_all; int rb_send_index; struct rpcrdma_req **rb_send_bufs; int rb_recv_index; -- cgit v1.2.1 From c2922c0235aac1c787fa81e24d7d7e93c2202275 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:24:36 -0400 Subject: xprtrdma: Properly handle exhaustion of the rb_mws list If the rb_mws list is exhausted, clean up and return NULL so that call_allocate() will delay and try again. Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 103 ++++++++++++++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 0ad7d10f13a7..017f0abb2a86 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1256,6 +1256,67 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) kfree(buf->rb_pool); } +/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving + * some req segments uninitialized. + */ +static void +rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf) +{ + if (*mw) { + list_add_tail(&(*mw)->mw_list, &buf->rb_mws); + *mw = NULL; + } +} + +/* Cycle mw's back in reverse order, and "spin" them. + * This delays and scrambles reuse as much as possible. + */ +static void +rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mr_seg *seg = req->rl_segments; + struct rpcrdma_mr_seg *seg1 = seg; + int i; + + for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++) + rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf); + rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf); +} + +static void +rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +{ + buf->rb_send_bufs[--buf->rb_send_index] = req; + req->rl_niovs = 0; + if (req->rl_reply) { + buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply; + req->rl_reply->rr_func = NULL; + req->rl_reply = NULL; + } +} + +static struct rpcrdma_req * +rpcrdma_buffer_get_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int i; + + i = RPCRDMA_MAX_SEGS - 1; + while (!list_empty(&buf->rb_mws)) { + r = list_entry(buf->rb_mws.next, + struct rpcrdma_mw, mw_list); + list_del(&r->mw_list); + req->rl_segments[i].mr_chunk.rl_mw = r; + if (unlikely(i-- == 0)) + return req; /* Success */ + } + + /* Not enough entries on rb_mws for this req */ + rpcrdma_buffer_put_sendbuf(req, buf); + rpcrdma_buffer_put_mrs(req, buf); + return NULL; +} + /* * Get a set of request/reply buffers. * @@ -1268,10 +1329,9 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) struct rpcrdma_req * rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) { + struct rpcrdma_ia *ia = rdmab_to_ia(buffers); struct rpcrdma_req *req; unsigned long flags; - int i; - struct rpcrdma_mw *r; spin_lock_irqsave(&buffers->rb_lock, flags); if (buffers->rb_send_index == buffers->rb_max_requests) { @@ -1291,14 +1351,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; } buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; - if (!list_empty(&buffers->rb_mws)) { - i = RPCRDMA_MAX_SEGS - 1; - do { - r = list_entry(buffers->rb_mws.next, - struct rpcrdma_mw, mw_list); - list_del(&r->mw_list); - req->rl_segments[i].mr_chunk.rl_mw = r; - } while (--i >= 0); + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + case RPCRDMA_MTHCAFMR: + req = rpcrdma_buffer_get_mrs(req, buffers); + break; + default: + break; } spin_unlock_irqrestore(&buffers->rb_lock, flags); return req; @@ -1313,34 +1372,14 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) { struct rpcrdma_buffer *buffers = req->rl_buffer; struct rpcrdma_ia *ia = rdmab_to_ia(buffers); - int i; unsigned long flags; spin_lock_irqsave(&buffers->rb_lock, flags); - buffers->rb_send_bufs[--buffers->rb_send_index] = req; - req->rl_niovs = 0; - if (req->rl_reply) { - buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply; - req->rl_reply->rr_func = NULL; - req->rl_reply = NULL; - } + rpcrdma_buffer_put_sendbuf(req, buffers); switch (ia->ri_memreg_strategy) { case RPCRDMA_FRMR: case RPCRDMA_MTHCAFMR: - /* - * Cycle mw's back in reverse order, and "spin" them. - * This delays and scrambles reuse as much as possible. - */ - i = 1; - do { - struct rpcrdma_mw **mw; - mw = &req->rl_segments[i].mr_chunk.rl_mw; - list_add_tail(&(*mw)->mw_list, &buffers->rb_mws); - *mw = NULL; - } while (++i < RPCRDMA_MAX_SEGS); - list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list, - &buffers->rb_mws); - req->rl_segments[0].mr_chunk.rl_mw = NULL; + rpcrdma_buffer_put_mrs(req, buffers); break; default: break; -- cgit v1.2.1 From 9f9d802a28a107937ecda4ff78de2ab5cedd439d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:24:45 -0400 Subject: xprtrdma: Reset FRMRs when FAST_REG_MR is flushed by a disconnect FAST_REG_MR Work Requests update a Memory Region's rkey. Rkey's are used to block unwanted access to the memory controlled by an MR. The rkey is passed to the receiver (the NFS server, in our case), and is also used by xprtrdma to invalidate the MR when the RPC is complete. When a FAST_REG_MR Work Request is flushed after a transport disconnect, xprtrdma cannot tell whether the WR actually hit the adapter or not. So it is indeterminant at that point whether the existing rkey is still valid. After the transport connection is re-established, the next FAST_REG_MR or LOCAL_INV Work Request against that MR can sometimes fail because the rkey value does not match what xprtrdma expects. The only reliable way to recover in this case is to deregister and register the MR before it is used again. These operations can be done only in a process context, so handle it in the transport connect worker. Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 67 +++++++++++++++++++++++++++++++++++++++-- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 2 files changed, 66 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 017f0abb2a86..3a6376a77fcc 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -61,6 +61,8 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +static void rpcrdma_reset_frmrs(struct rpcrdma_ia *); + /* * internal functions */ @@ -152,8 +154,10 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc) if (wc->wr_id == 0ULL) return; - if (wc->status != IB_WC_SUCCESS) + if (wc->status != IB_WC_SUCCESS) { + frmr->r.frmr.fr_state = FRMR_IS_STALE; return; + } if (wc->opcode == IB_WC_FAST_REG_MR) frmr->r.frmr.fr_state = FRMR_IS_VALID; @@ -881,6 +885,9 @@ retry: " status %i\n", __func__, rc); rpcrdma_flush_cqs(ep); + if (ia->ri_memreg_strategy == RPCRDMA_FRMR) + rpcrdma_reset_frmrs(ia); + xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); id = rpcrdma_create_id(xprt, ia, (struct sockaddr *)&xprt->rx_data.addr); @@ -1256,6 +1263,62 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) kfree(buf->rb_pool); } +/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in + * an unusable state. Find FRMRs in this state and dereg / reg + * each. FRMRs that are VALID and attached to an rpcrdma_req are + * also torn down. + * + * This gives all in-use FRMRs a fresh rkey and leaves them INVALID. + * + * This is invoked only in the transport connect worker in order + * to serialize with rpcrdma_register_frmr_external(). + */ +static void +rpcrdma_reset_frmrs(struct rpcrdma_ia *ia) +{ + struct rpcrdma_xprt *r_xprt = + container_of(ia, struct rpcrdma_xprt, rx_ia); + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct list_head *pos; + struct rpcrdma_mw *r; + int rc; + + list_for_each(pos, &buf->rb_all) { + r = list_entry(pos, struct rpcrdma_mw, mw_all); + + if (r->r.frmr.fr_state == FRMR_IS_INVALID) + continue; + + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + + r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, + ia->ri_max_frmr_depth); + if (IS_ERR(r->r.frmr.fr_mr)) { + rc = PTR_ERR(r->r.frmr.fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr" + " failed %i\n", __func__, rc); + continue; + } + r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( + ia->ri_id->device, + ia->ri_max_frmr_depth); + if (IS_ERR(r->r.frmr.fr_pgl)) { + rc = PTR_ERR(r->r.frmr.fr_pgl); + dprintk("RPC: %s: " + "ib_alloc_fast_reg_page_list " + "failed %i\n", __func__, rc); + + ib_dereg_mr(r->r.frmr.fr_mr); + continue; + } + r->r.frmr.fr_state = FRMR_IS_INVALID; + } +} + /* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving * some req segments uninitialized. */ @@ -1575,7 +1638,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, dprintk("RPC: %s: Using frmr %p to map %d segments\n", __func__, mw, i); - if (unlikely(frmr->fr_state == FRMR_IS_VALID)) { + if (unlikely(frmr->fr_state != FRMR_IS_INVALID)) { dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n", __func__, mr->rkey); /* Invalidate before using. */ diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index c1d865287b0e..1ee6db30abc5 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -161,6 +161,7 @@ struct rpcrdma_rep { enum rpcrdma_frmr_state { FRMR_IS_INVALID, /* ready to be used */ FRMR_IS_VALID, /* in use */ + FRMR_IS_STALE, /* failed completion */ }; struct rpcrdma_frmr { -- cgit v1.2.1 From ddb6bebcc64678fcf73eb9e21f80c6dacfa093a7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:24:54 -0400 Subject: xprtrdma: Reset FRMRs after a flushed LOCAL_INV Work Request When a LOCAL_INV Work Request is flushed, it leaves an FRMR in the VALID state. This FRMR can be returned by rpcrdma_buffer_get(), and must be knocked down in rpcrdma_register_frmr_external() before it can be re-used. Instead, capture these in rpcrdma_buffer_get(), and reset them. Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 94 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3a6376a77fcc..ca55acf42365 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1358,8 +1358,91 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) } } +/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external(). + * Redo only the ib_post_send(). + */ +static void +rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia) +{ + struct rpcrdma_xprt *r_xprt = + container_of(ia, struct rpcrdma_xprt, rx_ia); + struct ib_send_wr invalidate_wr, *bad_wr; + int rc; + + dprintk("RPC: %s: FRMR %p is stale\n", __func__, r); + + /* When this FRMR is re-inserted into rb_mws, it is no longer stale */ + r->r.frmr.fr_state = FRMR_IS_VALID; + + memset(&invalidate_wr, 0, sizeof(invalidate_wr)); + invalidate_wr.wr_id = (unsigned long)(void *)r; + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.send_flags = IB_SEND_SIGNALED; + invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + dprintk("RPC: %s: frmr %p invalidating rkey %08x\n", + __func__, r, r->r.frmr.fr_mr->rkey); + + read_lock(&ia->ri_qplock); + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + read_unlock(&ia->ri_qplock); + if (rc) { + /* Force rpcrdma_buffer_get() to retry */ + r->r.frmr.fr_state = FRMR_IS_STALE; + dprintk("RPC: %s: ib_post_send failed, %i\n", + __func__, rc); + } +} + +static void +rpcrdma_retry_flushed_linv(struct list_head *stale, + struct rpcrdma_buffer *buf) +{ + struct rpcrdma_ia *ia = rdmab_to_ia(buf); + struct list_head *pos; + struct rpcrdma_mw *r; + unsigned long flags; + + list_for_each(pos, stale) { + r = list_entry(pos, struct rpcrdma_mw, mw_list); + rpcrdma_retry_local_inv(r, ia); + } + + spin_lock_irqsave(&buf->rb_lock, flags); + list_splice_tail(stale, &buf->rb_mws); + spin_unlock_irqrestore(&buf->rb_lock, flags); +} + static struct rpcrdma_req * -rpcrdma_buffer_get_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf, + struct list_head *stale) +{ + struct rpcrdma_mw *r; + int i; + + i = RPCRDMA_MAX_SEGS - 1; + while (!list_empty(&buf->rb_mws)) { + r = list_entry(buf->rb_mws.next, + struct rpcrdma_mw, mw_list); + list_del(&r->mw_list); + if (r->r.frmr.fr_state == FRMR_IS_STALE) { + list_add(&r->mw_list, stale); + continue; + } + req->rl_segments[i].mr_chunk.rl_mw = r; + if (unlikely(i-- == 0)) + return req; /* Success */ + } + + /* Not enough entries on rb_mws for this req */ + rpcrdma_buffer_put_sendbuf(req, buf); + rpcrdma_buffer_put_mrs(req, buf); + return NULL; +} + +static struct rpcrdma_req * +rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) { struct rpcrdma_mw *r; int i; @@ -1393,6 +1476,7 @@ struct rpcrdma_req * rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) { struct rpcrdma_ia *ia = rdmab_to_ia(buffers); + struct list_head stale; struct rpcrdma_req *req; unsigned long flags; @@ -1414,15 +1498,21 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; } buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; + + INIT_LIST_HEAD(&stale); switch (ia->ri_memreg_strategy) { case RPCRDMA_FRMR: + req = rpcrdma_buffer_get_frmrs(req, buffers, &stale); + break; case RPCRDMA_MTHCAFMR: - req = rpcrdma_buffer_get_mrs(req, buffers); + req = rpcrdma_buffer_get_fmrs(req, buffers); break; default: break; } spin_unlock_irqrestore(&buffers->rb_lock, flags); + if (!list_empty(&stale)) + rpcrdma_retry_flushed_linv(&stale, buffers); return req; } -- cgit v1.2.1 From 440ddad51b821a8ab9099addcc29d4d18d02f6ac Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:25:03 -0400 Subject: xprtrdma: Don't post a LOCAL_INV in rpcrdma_register_frmr_external() Any FRMR arriving in rpcrdma_register_frmr_external() is now guaranteed to be either invalid, or to be targeted by a queued LOCAL_INV that will invalidate it before the adapter processes the FAST_REG_MR being built here. The problem with current arrangement of chaining a LOCAL_INV to the FAST_REG_MR is that if the transport is not connected, the LOCAL_INV is flushed and the FAST_REG_MR is flushed. This leaves the FRMR valid with the old rkey. But rpcrdma_register_frmr_external() has already bumped the in-memory rkey. Next time through rpcrdma_register_frmr_external(), a LOCAL_INV and FAST_REG_MR is attempted again because the FRMR is still valid. But the rkey no longer matches the hardware's rkey, and a memory management operation error occurs. Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index ca55acf42365..7459b867d7d6 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1695,8 +1695,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw; struct rpcrdma_frmr *frmr = &mw->r.frmr; struct ib_mr *mr = frmr->fr_mr; - struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr; - + struct ib_send_wr frmr_wr, *bad_wr; u8 key; int len, pageoff; int i, rc; @@ -1728,22 +1727,6 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, dprintk("RPC: %s: Using frmr %p to map %d segments\n", __func__, mw, i); - if (unlikely(frmr->fr_state != FRMR_IS_INVALID)) { - dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n", - __func__, mr->rkey); - /* Invalidate before using. */ - memset(&invalidate_wr, 0, sizeof invalidate_wr); - invalidate_wr.wr_id = (unsigned long)(void *)mw; - invalidate_wr.next = &frmr_wr; - invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.send_flags = IB_SEND_SIGNALED; - invalidate_wr.ex.invalidate_rkey = mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - post_wr = &invalidate_wr; - } else - post_wr = &frmr_wr; - - /* Prepare FRMR WR */ memset(&frmr_wr, 0, sizeof frmr_wr); frmr_wr.wr_id = (unsigned long)(void *)mw; frmr_wr.opcode = IB_WR_FAST_REG_MR; @@ -1768,8 +1751,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, frmr_wr.wr.fast_reg.rkey = mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); - + rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr); if (rc) { dprintk("RPC: %s: failed ib_post_send for register," " status %i\n", __func__, rc); -- cgit v1.2.1 From 050557220e34ed5acc830c9bf6cd993f6b4ea33e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:25:12 -0400 Subject: xprtrdma: Disable completions for FAST_REG_MR Work Requests Instead of relying on a completion to change the state of an FRMR to FRMR_IS_VALID, set it in advance. If an error occurs, a completion will fire anyway and mark the FRMR FRMR_IS_STALE. Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 7459b867d7d6..3e8b3881548d 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -159,10 +159,7 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc) return; } - if (wc->opcode == IB_WC_FAST_REG_MR) - frmr->r.frmr.fr_state = FRMR_IS_VALID; - else if (wc->opcode == IB_WC_LOCAL_INV) - frmr->r.frmr.fr_state = FRMR_IS_INVALID; + frmr->r.frmr.fr_state = FRMR_IS_INVALID; } static int @@ -1727,10 +1724,11 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, dprintk("RPC: %s: Using frmr %p to map %d segments\n", __func__, mw, i); + frmr->fr_state = FRMR_IS_VALID; + memset(&frmr_wr, 0, sizeof frmr_wr); frmr_wr.wr_id = (unsigned long)(void *)mw; frmr_wr.opcode = IB_WR_FAST_REG_MR; - frmr_wr.send_flags = IB_SEND_SIGNALED; frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma; frmr_wr.wr.fast_reg.page_list = frmr->fr_pgl; frmr_wr.wr.fast_reg.page_list_len = page_no; @@ -1766,6 +1764,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, *nsegs = i; return 0; out_err: + frmr->fr_state = FRMR_IS_INVALID; while (i--) rpcrdma_unmap_one(ia, --seg); return rc; -- cgit v1.2.1 From dab7e3b8da5ef76143a7e609612c306898f8f8fc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:25:20 -0400 Subject: xprtrdma: Disable completions for LOCAL_INV Work Requests Instead of relying on a completion to change the state of an FRMR to FRMR_IS_INVALID, set it in advance. If an error occurs, a completion will fire anyway and mark the FRMR FRMR_IS_STALE. Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3e8b3881548d..08c92355c64c 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -154,12 +154,8 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc) if (wc->wr_id == 0ULL) return; - if (wc->status != IB_WC_SUCCESS) { + if (wc->status != IB_WC_SUCCESS) frmr->r.frmr.fr_state = FRMR_IS_STALE; - return; - } - - frmr->r.frmr.fr_state = FRMR_IS_INVALID; } static int @@ -1369,12 +1365,11 @@ rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia) dprintk("RPC: %s: FRMR %p is stale\n", __func__, r); /* When this FRMR is re-inserted into rb_mws, it is no longer stale */ - r->r.frmr.fr_state = FRMR_IS_VALID; + r->r.frmr.fr_state = FRMR_IS_INVALID; memset(&invalidate_wr, 0, sizeof(invalidate_wr)); invalidate_wr.wr_id = (unsigned long)(void *)r; invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.send_flags = IB_SEND_SIGNALED; invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); @@ -1778,10 +1773,11 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, struct ib_send_wr invalidate_wr, *bad_wr; int rc; + seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; + memset(&invalidate_wr, 0, sizeof invalidate_wr); invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.send_flags = IB_SEND_SIGNALED; invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); @@ -1790,9 +1786,12 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, rpcrdma_unmap_one(ia, seg++); rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); read_unlock(&ia->ri_qplock); - if (rc) + if (rc) { + /* Force rpcrdma_buffer_get() to retry */ + seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE; dprintk("RPC: %s: failed ib_post_send for invalidate," " status %i\n", __func__, rc); + } return rc; } -- cgit v1.2.1 From f590e878c52c38046fd7cfa5a742ddae68717484 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:25:29 -0400 Subject: xprtrdma: Rename frmr_wr Clean up: Name frmr_wr after the opcode of the Work Request, consistent with the send and local invalidation paths. Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 08c92355c64c..80c01638a66b 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1687,7 +1687,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw; struct rpcrdma_frmr *frmr = &mw->r.frmr; struct ib_mr *mr = frmr->fr_mr; - struct ib_send_wr frmr_wr, *bad_wr; + struct ib_send_wr fastreg_wr, *bad_wr; u8 key; int len, pageoff; int i, rc; @@ -1721,15 +1721,15 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, frmr->fr_state = FRMR_IS_VALID; - memset(&frmr_wr, 0, sizeof frmr_wr); - frmr_wr.wr_id = (unsigned long)(void *)mw; - frmr_wr.opcode = IB_WR_FAST_REG_MR; - frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma; - frmr_wr.wr.fast_reg.page_list = frmr->fr_pgl; - frmr_wr.wr.fast_reg.page_list_len = page_no; - frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; - if (frmr_wr.wr.fast_reg.length < len) { + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.wr_id = (unsigned long)(void *)mw; + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma; + fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; + fastreg_wr.wr.fast_reg.page_list_len = page_no; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; + if (fastreg_wr.wr.fast_reg.length < len) { rc = -EIO; goto out_err; } @@ -1738,13 +1738,13 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, key = (u8)(mr->rkey & 0x000000FF); ib_update_fast_reg_key(mr, ++key); - frmr_wr.wr.fast_reg.access_flags = (writing ? + fastreg_wr.wr.fast_reg.access_flags = (writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ); - frmr_wr.wr.fast_reg.rkey = mr->rkey; + fastreg_wr.wr.fast_reg.rkey = mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr); + rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); if (rc) { dprintk("RPC: %s: failed ib_post_send for register," " status %i\n", __func__, rc); -- cgit v1.2.1 From 2e84522c2e0323a090fe1f7eeed6d5b6a68efe5f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:25:38 -0400 Subject: xprtrdma: Allocate each struct rpcrdma_mw separately Currently rpcrdma_buffer_create() allocates struct rpcrdma_mw's as a single contiguous area of memory. It amounts to quite a bit of memory, and there's no requirement for these to be carved from a single piece of contiguous memory. Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 242 ++++++++++++++++++++++++++------------------ 1 file changed, 143 insertions(+), 99 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 80c01638a66b..31c4fd36d62c 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1005,9 +1005,91 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) return rc; } -/* - * Initialize buffer memory - */ +static int +rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) +{ + int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; + struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_DATA_SEGS, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + struct rpcrdma_mw *r; + int i, rc; + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); + + while (i--) { + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (r == NULL) + return -ENOMEM; + + r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr); + if (IS_ERR(r->r.fmr)) { + rc = PTR_ERR(r->r.fmr); + dprintk("RPC: %s: ib_alloc_fmr failed %i\n", + __func__, rc); + goto out_free; + } + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + } + return 0; + +out_free: + kfree(r); + return rc; +} + +static int +rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) +{ + struct rpcrdma_frmr *f; + struct rpcrdma_mw *r; + int i, rc; + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); + + while (i--) { + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (r == NULL) + return -ENOMEM; + f = &r->r.frmr; + + f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, + ia->ri_max_frmr_depth); + if (IS_ERR(f->fr_mr)) { + rc = PTR_ERR(f->fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr " + "failed %i\n", __func__, rc); + goto out_free; + } + + f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device, + ia->ri_max_frmr_depth); + if (IS_ERR(f->fr_pgl)) { + rc = PTR_ERR(f->fr_pgl); + dprintk("RPC: %s: ib_alloc_fast_reg_page_list " + "failed %i\n", __func__, rc); + + ib_dereg_mr(f->fr_mr); + goto out_free; + } + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + } + + return 0; + +out_free: + kfree(r); + return rc; +} + int rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) @@ -1015,7 +1097,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, char *p; size_t len, rlen, wlen; int i, rc; - struct rpcrdma_mw *r; buf->rb_max_requests = cdata->max_requests; spin_lock_init(&buf->rb_lock); @@ -1026,28 +1107,12 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * 2. arrays of struct rpcrdma_req to fill in pointers * 3. array of struct rpcrdma_rep for replies * 4. padding, if any - * 5. mw's, fmr's or frmr's, if any * Send/recv buffers in req/rep need to be registered */ - len = buf->rb_max_requests * (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); len += cdata->padding; - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - len += buf->rb_max_requests * RPCRDMA_MAX_SEGS * - sizeof(struct rpcrdma_mw); - break; - case RPCRDMA_MTHCAFMR: - /* TBD we are perhaps overallocating here */ - len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * - sizeof(struct rpcrdma_mw); - break; - default: - break; - } - /* allocate 1, 4 and 5 in one shot */ p = kzalloc(len, GFP_KERNEL); if (p == NULL) { dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n", @@ -1075,53 +1140,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, INIT_LIST_HEAD(&buf->rb_mws); INIT_LIST_HEAD(&buf->rb_all); - r = (struct rpcrdma_mw *)p; switch (ia->ri_memreg_strategy) { case RPCRDMA_FRMR: - for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { - r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_mr)) { - rc = PTR_ERR(r->r.frmr.fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr" - " failed %i\n", __func__, rc); - goto out; - } - r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( - ia->ri_id->device, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_pgl)) { - rc = PTR_ERR(r->r.frmr.fr_pgl); - dprintk("RPC: %s: " - "ib_alloc_fast_reg_page_list " - "failed %i\n", __func__, rc); - - ib_dereg_mr(r->r.frmr.fr_mr); - goto out; - } - list_add(&r->mw_all, &buf->rb_all); - list_add(&r->mw_list, &buf->rb_mws); - ++r; - } + rc = rpcrdma_init_frmrs(ia, buf); + if (rc) + goto out; break; case RPCRDMA_MTHCAFMR: - /* TBD we are perhaps overallocating here */ - for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { - static struct ib_fmr_attr fa = - { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT }; - r->r.fmr = ib_alloc_fmr(ia->ri_pd, - IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ, - &fa); - if (IS_ERR(r->r.fmr)) { - rc = PTR_ERR(r->r.fmr); - dprintk("RPC: %s: ib_alloc_fmr" - " failed %i\n", __func__, rc); - goto out; - } - list_add(&r->mw_all, &buf->rb_all); - list_add(&r->mw_list, &buf->rb_mws); - ++r; - } + rc = rpcrdma_init_fmrs(ia, buf); + if (rc) + goto out; break; default: break; @@ -1189,24 +1217,57 @@ out: return rc; } -/* - * Unregister and destroy buffer memory. Need to deal with - * partial initialization, so it's callable from failed create. - * Must be called before destroying endpoint, as registrations - * reference it. - */ +static void +rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int rc; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + list_del(&r->mw_list); + + rc = ib_dealloc_fmr(r->r.fmr); + if (rc) + dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", + __func__, rc); + + kfree(r); + } +} + +static void +rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int rc; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + list_del(&r->mw_list); + + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + + kfree(r); + } +} + void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - int rc, i; struct rpcrdma_ia *ia = rdmab_to_ia(buf); - struct rpcrdma_mw *r; + int i; /* clean up in reverse order from create * 1. recv mr memory (mr free, then kfree) * 2. send mr memory (mr free, then kfree) - * 3. padding (if any) [moved to rpcrdma_ep_destroy] - * 4. arrays + * 3. MWs */ dprintk("RPC: %s: entering\n", __func__); @@ -1225,32 +1286,15 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) } } - while (!list_empty(&buf->rb_mws)) { - r = list_entry(buf->rb_mws.next, - struct rpcrdma_mw, mw_list); - list_del(&r->mw_all); - list_del(&r->mw_list); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s:" - " ib_dereg_mr" - " failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - break; - case RPCRDMA_MTHCAFMR: - rc = ib_dealloc_fmr(r->r.fmr); - if (rc) - dprintk("RPC: %s:" - " ib_dealloc_fmr" - " failed %i\n", - __func__, rc); - break; - default: - break; - } + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + rpcrdma_destroy_frmrs(buf); + break; + case RPCRDMA_MTHCAFMR: + rpcrdma_destroy_fmrs(buf); + break; + default: + break; } kfree(buf->rb_pool); -- cgit v1.2.1 From bb96193d9104613cd87fb518f25db3fadc36432e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:25:46 -0400 Subject: xprtrdma: Schedule reply tasklet once per upcall Minor optimization: grab rpcrdma_tk_lock_g and disable hard IRQs just once after clearing the receive completion queue. Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 31c4fd36d62c..f124f04e2e4e 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -105,17 +105,6 @@ rpcrdma_run_tasklet(unsigned long data) static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL); -static inline void -rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep) -{ - unsigned long flags; - - spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); - list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g); - spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); - tasklet_schedule(&rpcrdma_tasklet_g); -} - static void rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) { @@ -214,7 +203,7 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) } static void -rpcrdma_recvcq_process_wc(struct ib_wc *wc) +rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list) { struct rpcrdma_rep *rep = (struct rpcrdma_rep *)(unsigned long)wc->wr_id; @@ -245,28 +234,38 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc) } out_schedule: - rpcrdma_schedule_tasklet(rep); + list_add_tail(&rep->rr_list, sched_list); } static int rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) { + struct list_head sched_list; struct ib_wc *wcs; int budget, count, rc; + unsigned long flags; + INIT_LIST_HEAD(&sched_list); budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; do { wcs = ep->rep_recv_wcs; rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); if (rc <= 0) - return rc; + goto out_schedule; count = rc; while (count-- > 0) - rpcrdma_recvcq_process_wc(wcs++); + rpcrdma_recvcq_process_wc(wcs++, &sched_list); } while (rc == RPCRDMA_POLLSIZE && --budget); - return 0; + rc = 0; + +out_schedule: + spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); + list_splice_tail(&sched_list, &rpcrdma_tasklets_g); + spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); + tasklet_schedule(&rpcrdma_tasklet_g); + return rc; } /* -- cgit v1.2.1 From 282191cb725db9a1aa80269e8369b06e9270a948 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:25:55 -0400 Subject: xprtrdma: Make rpcrdma_ep_disconnect() return void Clean up: The return code is used only for dprintk's that are already redundant. Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 14 ++++---------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 3 files changed, 6 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index f6d280b31dc9..2faac4940563 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -414,7 +414,7 @@ xprt_rdma_close(struct rpc_xprt *xprt) if (r_xprt->rx_ep.rep_connected > 0) xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); - (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); + rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); } static void diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index f124f04e2e4e..1208ab2e655f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -830,10 +830,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) cancel_delayed_work_sync(&ep->rep_connect_worker); if (ia->ri_id->qp) { - rc = rpcrdma_ep_disconnect(ep, ia); - if (rc) - dprintk("RPC: %s: rpcrdma_ep_disconnect" - " returned %i\n", __func__, rc); + rpcrdma_ep_disconnect(ep, ia); rdma_destroy_qp(ia->ri_id); ia->ri_id->qp = NULL; } @@ -871,10 +868,8 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) struct rpcrdma_xprt *xprt; retry: dprintk("RPC: %s: reconnecting...\n", __func__); - rc = rpcrdma_ep_disconnect(ep, ia); - if (rc && rc != -ENOTCONN) - dprintk("RPC: %s: rpcrdma_ep_disconnect" - " status %i\n", __func__, rc); + + rpcrdma_ep_disconnect(ep, ia); rpcrdma_flush_cqs(ep); if (ia->ri_memreg_strategy == RPCRDMA_FRMR) @@ -984,7 +979,7 @@ out: * This call is not reentrant, and must not be made in parallel * on the same endpoint. */ -int +void rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { int rc; @@ -1001,7 +996,6 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc); ep->rep_connected = rc; } - return rc; } static int diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 1ee6db30abc5..c419498b8f46 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -341,7 +341,7 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, struct rpcrdma_create_data_internal *); void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); -int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); +void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_req *); -- cgit v1.2.1 From a779ca5fa766e270b9e11c162d877295e2904f4e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:26:04 -0400 Subject: xprtrdma: Remove RPCRDMA_PERSISTENT_REGISTRATION macro Clean up. RPCRDMA_PERSISTENT_REGISTRATION was a compile-time switch between RPCRDMA_REGISTER mode and RPCRDMA_ALLPHYSICAL mode. Since RPCRDMA_REGISTER has been removed, there's no need for the extra conditional compilation. Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 1208ab2e655f..c2253d4c64df 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -561,12 +561,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if (!ia->ri_id->device->alloc_fmr) { dprintk("RPC: %s: MTHCAFMR registration " "not supported by HCA\n", __func__); -#if RPCRDMA_PERSISTENT_REGISTRATION memreg = RPCRDMA_ALLPHYSICAL; -#else - rc = -ENOMEM; - goto out2; -#endif } } @@ -581,20 +576,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) switch (memreg) { case RPCRDMA_FRMR: break; -#if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL: mem_priv = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; goto register_setup; -#endif case RPCRDMA_MTHCAFMR: if (ia->ri_have_dma_lkey) break; mem_priv = IB_ACCESS_LOCAL_WRITE; -#if RPCRDMA_PERSISTENT_REGISTRATION register_setup: -#endif ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); if (IS_ERR(ia->ri_bind_mem)) { printk(KERN_ALERT "%s: ib_get_dma_mr for " @@ -1905,7 +1896,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, switch (ia->ri_memreg_strategy) { -#if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL: rpcrdma_map_one(ia, seg, writing); seg->mr_rkey = ia->ri_bind_mem->rkey; @@ -1913,7 +1903,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, seg->mr_nsegs = 1; nsegs = 1; break; -#endif /* Registration using frmr registration */ case RPCRDMA_FRMR: @@ -1943,13 +1932,11 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, switch (ia->ri_memreg_strategy) { -#if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL: read_lock(&ia->ri_qplock); rpcrdma_unmap_one(ia, seg); read_unlock(&ia->ri_qplock); break; -#endif case RPCRDMA_FRMR: rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); -- cgit v1.2.1 From 8079fb785e34de6dff34bd846b8b79c212861edf Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Jul 2014 17:26:12 -0400 Subject: xprtrdma: Handle additional connection events Commit 38ca83a5 added RDMA_CM_EVENT_TIMEWAIT_EXIT. But that status is relevant only for consumers that re-use their QPs on new connections. xprtrdma creates a fresh QP on reconnection, so that event should be explicitly ignored. Squelch the alarming "unexpected CM event" message. Signed-off-by: Chuck Lever Tested-by: Steve Wise Tested-by: Shirley Ma Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index c2253d4c64df..61c41298b4ea 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -326,8 +326,16 @@ static const char * const conn[] = { "rejected", "established", "disconnected", - "device removal" + "device removal", + "multicast join", + "multicast error", + "address change", + "timewait exit", }; + +#define CONNECTION_MSG(status) \ + ((status) < ARRAY_SIZE(conn) ? \ + conn[(status)] : "unrecognized connection error") #endif static int @@ -385,23 +393,18 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DEVICE_REMOVAL: connstate = -ENODEV; connected: - dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n", - __func__, - (event->event <= 11) ? conn[event->event] : - "unknown connection error", - &addr->sin_addr.s_addr, - ntohs(addr->sin_port), - ep, event->event); atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1); dprintk("RPC: %s: %sconnected\n", __func__, connstate > 0 ? "" : "dis"); ep->rep_connected = connstate; ep->rep_func(ep); wake_up_all(&ep->rep_connect_wait); - break; + /*FALLTHROUGH*/ default: - dprintk("RPC: %s: unexpected CM event %d\n", - __func__, event->event); + dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n", + __func__, &addr->sin_addr.s_addr, + ntohs(addr->sin_port), ep, + CONNECTION_MSG(event->event)); break; } -- cgit v1.2.1