diff options
Diffstat (limited to 'net')
110 files changed, 2457 insertions, 924 deletions
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 56ca494621c6..0c5866bb49b6 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -31,7 +31,7 @@ #include <net/bluetooth/bluetooth.h> #include <linux/proc_fs.h> -#define VERSION "2.17" +#define VERSION "2.18" /* Bluetooth sockets */ #define BT_MAX_PROTO 8 diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 6ccc4eb9e55e..8b8b5f80dd89 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1228,7 +1228,7 @@ static void hci_set_event_mask_page_2(struct hci_request *req) /* If Connectionless Slave Broadcast master role is supported * enable all necessary events for it. */ - if (hdev->features[2][0] & 0x01) { + if (lmp_csb_master_capable(hdev)) { events[1] |= 0x40; /* Triggered Clock Capture */ events[1] |= 0x80; /* Synchronization Train Complete */ events[2] |= 0x10; /* Slave Page Response Timeout */ @@ -1238,7 +1238,7 @@ static void hci_set_event_mask_page_2(struct hci_request *req) /* If Connectionless Slave Broadcast slave role is supported * enable all necessary events for it. */ - if (hdev->features[2][0] & 0x02) { + if (lmp_csb_slave_capable(hdev)) { events[2] |= 0x01; /* Synchronization Train Received */ events[2] |= 0x02; /* CSB Receive */ events[2] |= 0x04; /* CSB Timeout */ @@ -1275,15 +1275,17 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt) hci_setup_link_policy(req); if (lmp_le_capable(hdev)) { - /* If the controller has a public BD_ADDR, then by - * default use that one. If this is a LE only - * controller without one, default to the random - * address. - */ - if (bacmp(&hdev->bdaddr, BDADDR_ANY)) - hdev->own_addr_type = ADDR_LE_DEV_PUBLIC; - else - hdev->own_addr_type = ADDR_LE_DEV_RANDOM; + if (test_bit(HCI_SETUP, &hdev->dev_flags)) { + /* If the controller has a public BD_ADDR, then + * by default use that one. 
If this is a LE only + * controller without a public address, default + * to the random address. + */ + if (bacmp(&hdev->bdaddr, BDADDR_ANY)) + hdev->own_addr_type = ADDR_LE_DEV_PUBLIC; + else + hdev->own_addr_type = ADDR_LE_DEV_RANDOM; + } hci_set_le_support(req); } @@ -1307,7 +1309,7 @@ static void hci_init4_req(struct hci_request *req, unsigned long opt) hci_set_event_mask_page_2(req); /* Check for Synchronization Train support */ - if (hdev->features[2][0] & 0x04) + if (lmp_sync_train_capable(hdev)) hci_req_add(req, HCI_OP_READ_SYNC_TRAIN_PARAMS, 0, NULL); } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 5935f748c0f9..5fb3df66c2cd 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -486,7 +486,10 @@ static void hci_cc_read_local_commands(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - if (!rp->status) + if (rp->status) + return; + + if (test_bit(HCI_SETUP, &hdev->dev_flags)) memcpy(hdev->commands, rp->commands, sizeof(hdev->commands)); } @@ -538,12 +541,6 @@ static void hci_cc_read_local_features(struct hci_dev *hdev, if (hdev->features[0][5] & LMP_EDR_3S_ESCO) hdev->esco_type |= (ESCO_2EV5 | ESCO_3EV5); - - BT_DBG("%s features 0x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x", hdev->name, - hdev->features[0][0], hdev->features[0][1], - hdev->features[0][2], hdev->features[0][3], - hdev->features[0][4], hdev->features[0][5], - hdev->features[0][6], hdev->features[0][7]); } static void hci_cc_read_local_ext_features(struct hci_dev *hdev, @@ -1782,7 +1779,9 @@ static u8 hci_to_mgmt_reason(u8 err) static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_disconn_complete *ev = (void *) skb->data; + u8 reason = hci_to_mgmt_reason(ev->reason); struct hci_conn *conn; + u8 type; BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); @@ -1792,43 +1791,38 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (!conn) goto unlock; - if 
(ev->status == 0) - conn->state = BT_CLOSED; + if (ev->status) { + mgmt_disconnect_failed(hdev, &conn->dst, conn->type, + conn->dst_type, ev->status); + goto unlock; + } - if (test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags) && - (conn->type == ACL_LINK || conn->type == LE_LINK)) { - if (ev->status) { - mgmt_disconnect_failed(hdev, &conn->dst, conn->type, - conn->dst_type, ev->status); - } else { - u8 reason = hci_to_mgmt_reason(ev->reason); + conn->state = BT_CLOSED; - mgmt_device_disconnected(hdev, &conn->dst, conn->type, - conn->dst_type, reason); - } - } + if (test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) + mgmt_device_disconnected(hdev, &conn->dst, conn->type, + conn->dst_type, reason); - if (ev->status == 0) { - u8 type = conn->type; + if (conn->type == ACL_LINK && conn->flush_key) + hci_remove_link_key(hdev, &conn->dst); - if (type == ACL_LINK && conn->flush_key) - hci_remove_link_key(hdev, &conn->dst); - hci_proto_disconn_cfm(conn, ev->reason); - hci_conn_del(conn); + type = conn->type; - /* Re-enable advertising if necessary, since it might - * have been disabled by the connection. From the - * HCI_LE_Set_Advertise_Enable command description in - * the core specification (v4.0): - * "The Controller shall continue advertising until the Host - * issues an LE_Set_Advertise_Enable command with - * Advertising_Enable set to 0x00 (Advertising is disabled) - * or until a connection is created or until the Advertising - * is timed out due to Directed Advertising." - */ - if (type == LE_LINK) - mgmt_reenable_advertising(hdev); - } + hci_proto_disconn_cfm(conn, ev->reason); + hci_conn_del(conn); + + /* Re-enable advertising if necessary, since it might + * have been disabled by the connection. 
From the + * HCI_LE_Set_Advertise_Enable command description in + * the core specification (v4.0): + * "The Controller shall continue advertising until the Host + * issues an LE_Set_Advertise_Enable command with + * Advertising_Enable set to 0x00 (Advertising is disabled) + * or until a connection is created or until the Advertising + * is timed out due to Directed Advertising." + */ + if (type == LE_LINK) + mgmt_reenable_advertising(hdev); unlock: hci_dev_unlock(hdev); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 4af3821df880..b6bca64b320d 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -49,6 +49,9 @@ static u8 l2cap_fixed_chan[8] = { L2CAP_FC_L2CAP | L2CAP_FC_CONNLESS, }; static LIST_HEAD(chan_list); static DEFINE_RWLOCK(chan_list_lock); +static u16 le_max_credits = L2CAP_LE_MAX_CREDITS; +static u16 le_default_mps = L2CAP_LE_DEFAULT_MPS; + static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, u8 code, u8 ident, u16 dlen, void *data); static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, @@ -213,9 +216,14 @@ int l2cap_add_scid(struct l2cap_chan *chan, __u16 scid) static u16 l2cap_alloc_cid(struct l2cap_conn *conn) { - u16 cid = L2CAP_CID_DYN_START; + u16 cid, dyn_end; + + if (conn->hcon->type == LE_LINK) + dyn_end = L2CAP_CID_LE_DYN_END; + else + dyn_end = L2CAP_CID_DYN_END; - for (; cid < L2CAP_CID_DYN_END; cid++) { + for (cid = L2CAP_CID_DYN_START; cid < dyn_end; cid++) { if (!__l2cap_get_chan_by_scid(conn, cid)) return cid; } @@ -490,6 +498,18 @@ void l2cap_chan_set_defaults(struct l2cap_chan *chan) set_bit(FLAG_FORCE_ACTIVE, &chan->flags); } +static void l2cap_le_flowctl_init(struct l2cap_chan *chan) +{ + chan->sdu = NULL; + chan->sdu_last_frag = NULL; + chan->sdu_len = 0; + chan->tx_credits = 0; + chan->rx_credits = le_max_credits; + chan->mps = min_t(u16, chan->imtu, L2CAP_LE_DEFAULT_MPS); + + skb_queue_head_init(&chan->tx_q); +} + void __l2cap_chan_add(struct 
l2cap_conn *conn, struct l2cap_chan *chan) { BT_DBG("conn %p, psm 0x%2.2x, dcid 0x%4.4x", conn, @@ -502,12 +522,12 @@ void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) switch (chan->chan_type) { case L2CAP_CHAN_CONN_ORIENTED: if (conn->hcon->type == LE_LINK) { - /* LE connection */ - chan->omtu = L2CAP_DEFAULT_MTU; - if (chan->dcid == L2CAP_CID_ATT) + if (chan->dcid == L2CAP_CID_ATT) { + chan->omtu = L2CAP_DEFAULT_MTU; chan->scid = L2CAP_CID_ATT; - else + } else { chan->scid = l2cap_alloc_cid(conn); + } } else { /* Alloc CID for connection-oriented socket */ chan->scid = l2cap_alloc_cid(conn); @@ -597,6 +617,10 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) case L2CAP_MODE_BASIC: break; + case L2CAP_MODE_LE_FLOWCTL: + skb_queue_purge(&chan->tx_q); + break; + case L2CAP_MODE_ERTM: __clear_retrans_timer(chan); __clear_monitor_timer(chan); @@ -617,6 +641,50 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) return; } +static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan) +{ + struct l2cap_conn *conn = chan->conn; + struct l2cap_le_conn_rsp rsp; + u16 result; + + if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) + result = L2CAP_CR_AUTHORIZATION; + else + result = L2CAP_CR_BAD_PSM; + + l2cap_state_change(chan, BT_DISCONN); + + rsp.dcid = cpu_to_le16(chan->scid); + rsp.mtu = cpu_to_le16(chan->imtu); + rsp.mps = cpu_to_le16(chan->mps); + rsp.credits = cpu_to_le16(chan->rx_credits); + rsp.result = cpu_to_le16(result); + + l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_RSP, sizeof(rsp), + &rsp); +} + +static void l2cap_chan_connect_reject(struct l2cap_chan *chan) +{ + struct l2cap_conn *conn = chan->conn; + struct l2cap_conn_rsp rsp; + u16 result; + + if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) + result = L2CAP_CR_SEC_BLOCK; + else + result = L2CAP_CR_BAD_PSM; + + l2cap_state_change(chan, BT_DISCONN); + + rsp.scid = cpu_to_le16(chan->dcid); + rsp.dcid = cpu_to_le16(chan->scid); + rsp.result = cpu_to_le16(result); + 
rsp.status = __constant_cpu_to_le16(L2CAP_CS_NO_INFO); + + l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_RSP, sizeof(rsp), &rsp); +} + void l2cap_chan_close(struct l2cap_chan *chan, int reason) { struct l2cap_conn *conn = chan->conn; @@ -630,8 +698,10 @@ void l2cap_chan_close(struct l2cap_chan *chan, int reason) case BT_CONNECTED: case BT_CONFIG: - if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED && - conn->hcon->type == ACL_LINK) { + /* ATT uses L2CAP_CHAN_CONN_ORIENTED so we must also + * check for chan->psm. + */ + if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED && chan->psm) { __set_chan_timer(chan, chan->ops->get_sndtimeo(chan)); l2cap_send_disconn_req(chan, reason); } else @@ -639,24 +709,11 @@ void l2cap_chan_close(struct l2cap_chan *chan, int reason) break; case BT_CONNECT2: - if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED && - conn->hcon->type == ACL_LINK) { - struct l2cap_conn_rsp rsp; - __u16 result; - - if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) - result = L2CAP_CR_SEC_BLOCK; - else - result = L2CAP_CR_BAD_PSM; - - l2cap_state_change(chan, BT_DISCONN); - - rsp.scid = cpu_to_le16(chan->dcid); - rsp.dcid = cpu_to_le16(chan->scid); - rsp.result = cpu_to_le16(result); - rsp.status = __constant_cpu_to_le16(L2CAP_CS_NO_INFO); - l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_RSP, - sizeof(rsp), &rsp); + if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED) { + if (conn->hcon->type == ACL_LINK) + l2cap_chan_connect_reject(chan); + else if (conn->hcon->type == LE_LINK) + l2cap_chan_le_connect_reject(chan); } l2cap_chan_del(chan, reason); @@ -726,6 +783,9 @@ int l2cap_chan_check_security(struct l2cap_chan *chan) struct l2cap_conn *conn = chan->conn; __u8 auth_type; + if (conn->hcon->type == LE_LINK) + return smp_conn_security(conn->hcon, chan->sec_level); + auth_type = l2cap_get_auth_type(chan); return hci_conn_security(conn->hcon, chan->sec_level, auth_type); @@ -1152,16 +1212,57 @@ static void l2cap_chan_ready(struct l2cap_chan *chan) chan->conf_state = 0; 
__clear_chan_timer(chan); + if (chan->mode == L2CAP_MODE_LE_FLOWCTL && !chan->tx_credits) + chan->ops->suspend(chan); + chan->state = BT_CONNECTED; chan->ops->ready(chan); } +static void l2cap_le_connect(struct l2cap_chan *chan) +{ + struct l2cap_conn *conn = chan->conn; + struct l2cap_le_conn_req req; + + if (test_and_set_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags)) + return; + + req.psm = chan->psm; + req.scid = cpu_to_le16(chan->scid); + req.mtu = cpu_to_le16(chan->imtu); + req.mps = cpu_to_le16(chan->mps); + req.credits = cpu_to_le16(chan->rx_credits); + + chan->ident = l2cap_get_ident(conn); + + l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_REQ, + sizeof(req), &req); +} + +static void l2cap_le_start(struct l2cap_chan *chan) +{ + struct l2cap_conn *conn = chan->conn; + + if (!smp_conn_security(conn->hcon, chan->sec_level)) + return; + + if (!chan->psm) { + l2cap_chan_ready(chan); + return; + } + + if (chan->state == BT_CONNECT) + l2cap_le_connect(chan); +} + static void l2cap_start_connection(struct l2cap_chan *chan) { if (__amp_capable(chan)) { BT_DBG("chan %p AMP capable: discover AMPs", chan); a2mp_discover_amp(chan); + } else if (chan->conn->hcon->type == LE_LINK) { + l2cap_le_start(chan); } else { l2cap_send_conn_req(chan); } @@ -1172,7 +1273,7 @@ static void l2cap_do_start(struct l2cap_chan *chan) struct l2cap_conn *conn = chan->conn; if (conn->hcon->type == LE_LINK) { - l2cap_chan_ready(chan); + l2cap_le_start(chan); return; } @@ -1430,9 +1531,7 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) } if (hcon->type == LE_LINK) { - if (smp_conn_security(hcon, chan->sec_level)) - l2cap_chan_ready(chan); - + l2cap_le_start(chan); } else if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { l2cap_chan_ready(chan); @@ -1703,7 +1802,8 @@ EXPORT_SYMBOL(l2cap_conn_put); */ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, bdaddr_t *src, - bdaddr_t *dst) + bdaddr_t *dst, + u8 link_type) { struct l2cap_chan *c, *c1 = NULL; @@ -1713,6 
+1813,12 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, if (state && c->state != state) continue; + if (link_type == ACL_LINK && c->src_type != BDADDR_BREDR) + continue; + + if (link_type == LE_LINK && c->src_type == BDADDR_BREDR) + continue; + if (c->psm == psm) { int src_match, dst_match; int src_any, dst_any; @@ -1739,6 +1845,18 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, return c1; } +static bool is_valid_psm(u16 psm, u8 dst_type) +{ + if (!psm) + return false; + + if (bdaddr_type_is_le(dst_type)) + return (psm <= 0x00ff); + + /* PSM must be odd and lsb of upper byte must be 0 */ + return ((psm & 0x0101) == 0x0001); +} + int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, bdaddr_t *dst, u8 dst_type) { @@ -1759,8 +1877,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, l2cap_chan_lock(chan); - /* PSM must be odd and lsb of upper byte must be 0 */ - if ((__le16_to_cpu(psm) & 0x0101) != 0x0001 && !cid && + if (!is_valid_psm(__le16_to_cpu(psm), dst_type) && !cid && chan->chan_type != L2CAP_CHAN_RAW) { err = -EINVAL; goto done; @@ -1774,6 +1891,9 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, switch (chan->mode) { case L2CAP_MODE_BASIC: break; + case L2CAP_MODE_LE_FLOWCTL: + l2cap_le_flowctl_init(chan); + break; case L2CAP_MODE_ERTM: case L2CAP_MODE_STREAMING: if (!disable_ertm) @@ -2432,6 +2552,89 @@ static int l2cap_segment_sdu(struct l2cap_chan *chan, return 0; } +static struct sk_buff *l2cap_create_le_flowctl_pdu(struct l2cap_chan *chan, + struct msghdr *msg, + size_t len, u16 sdulen) +{ + struct l2cap_conn *conn = chan->conn; + struct sk_buff *skb; + int err, count, hlen; + struct l2cap_hdr *lh; + + BT_DBG("chan %p len %zu", chan, len); + + if (!conn) + return ERR_PTR(-ENOTCONN); + + hlen = L2CAP_HDR_SIZE; + + if (sdulen) + hlen += L2CAP_SDULEN_SIZE; + + count = min_t(unsigned int, (conn->mtu - hlen), len); + + skb = 
chan->ops->alloc_skb(chan, count + hlen, + msg->msg_flags & MSG_DONTWAIT); + if (IS_ERR(skb)) + return skb; + + /* Create L2CAP header */ + lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); + lh->cid = cpu_to_le16(chan->dcid); + lh->len = cpu_to_le16(len + (hlen - L2CAP_HDR_SIZE)); + + if (sdulen) + put_unaligned_le16(sdulen, skb_put(skb, L2CAP_SDULEN_SIZE)); + + err = l2cap_skbuff_fromiovec(chan, msg, len, count, skb); + if (unlikely(err < 0)) { + kfree_skb(skb); + return ERR_PTR(err); + } + + return skb; +} + +static int l2cap_segment_le_sdu(struct l2cap_chan *chan, + struct sk_buff_head *seg_queue, + struct msghdr *msg, size_t len) +{ + struct sk_buff *skb; + size_t pdu_len; + u16 sdu_len; + + BT_DBG("chan %p, msg %p, len %zu", chan, msg, len); + + pdu_len = chan->conn->mtu - L2CAP_HDR_SIZE; + + pdu_len = min_t(size_t, pdu_len, chan->remote_mps); + + sdu_len = len; + pdu_len -= L2CAP_SDULEN_SIZE; + + while (len > 0) { + if (len <= pdu_len) + pdu_len = len; + + skb = l2cap_create_le_flowctl_pdu(chan, msg, pdu_len, sdu_len); + if (IS_ERR(skb)) { + __skb_queue_purge(seg_queue); + return PTR_ERR(skb); + } + + __skb_queue_tail(seg_queue, skb); + + len -= pdu_len; + + if (sdu_len) { + sdu_len = 0; + pdu_len += L2CAP_SDULEN_SIZE; + } + } + + return 0; +} + int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, u32 priority) { @@ -2453,6 +2656,40 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, } switch (chan->mode) { + case L2CAP_MODE_LE_FLOWCTL: + /* Check outgoing MTU */ + if (len > chan->omtu) + return -EMSGSIZE; + + if (!chan->tx_credits) + return -EAGAIN; + + __skb_queue_head_init(&seg_queue); + + err = l2cap_segment_le_sdu(chan, &seg_queue, msg, len); + + if (chan->state != BT_CONNECTED) { + __skb_queue_purge(&seg_queue); + err = -ENOTCONN; + } + + if (err) + return err; + + skb_queue_splice_tail_init(&seg_queue, &chan->tx_q); + + while (chan->tx_credits && !skb_queue_empty(&chan->tx_q)) { + 
l2cap_do_send(chan, skb_dequeue(&chan->tx_q)); + chan->tx_credits--; + } + + if (!chan->tx_credits) + chan->ops->suspend(chan); + + err = len; + + break; + case L2CAP_MODE_BASIC: /* Check outgoing MTU */ if (len > chan->omtu) @@ -3592,6 +3829,23 @@ static int l2cap_build_conf_rsp(struct l2cap_chan *chan, void *data, return ptr - data; } +void __l2cap_le_connect_rsp_defer(struct l2cap_chan *chan) +{ + struct l2cap_le_conn_rsp rsp; + struct l2cap_conn *conn = chan->conn; + + BT_DBG("chan %p", chan); + + rsp.dcid = cpu_to_le16(chan->scid); + rsp.mtu = cpu_to_le16(chan->imtu); + rsp.mps = cpu_to_le16(chan->mps); + rsp.credits = cpu_to_le16(chan->rx_credits); + rsp.result = __constant_cpu_to_le16(L2CAP_CR_SUCCESS); + + l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_RSP, sizeof(rsp), + &rsp); +} + void __l2cap_connect_rsp_defer(struct l2cap_chan *chan) { struct l2cap_conn_rsp rsp; @@ -3713,7 +3967,7 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn, /* Check if we have socket listening on psm */ pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src, - &conn->hcon->dst); + &conn->hcon->dst, ACL_LINK); if (!pchan) { result = L2CAP_CR_BAD_PSM; goto sendresp; @@ -5155,18 +5409,17 @@ static inline int l2cap_check_conn_param(u16 min, u16 max, u16 latency, static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, - u8 *data) + u16 cmd_len, u8 *data) { struct hci_conn *hcon = conn->hcon; struct l2cap_conn_param_update_req *req; struct l2cap_conn_param_update_rsp rsp; - u16 min, max, latency, to_multiplier, cmd_len; + u16 min, max, latency, to_multiplier; int err; if (!(hcon->link_mode & HCI_LM_MASTER)) return -EINVAL; - cmd_len = __le16_to_cpu(cmd->len); if (cmd_len != sizeof(struct l2cap_conn_param_update_req)) return -EPROTO; @@ -5196,6 +5449,65 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, return 0; } +static int l2cap_le_connect_rsp(struct l2cap_conn *conn, + struct 
l2cap_cmd_hdr *cmd, u16 cmd_len, + u8 *data) +{ + struct l2cap_le_conn_rsp *rsp = (struct l2cap_le_conn_rsp *) data; + u16 dcid, mtu, mps, credits, result; + struct l2cap_chan *chan; + int err; + + if (cmd_len < sizeof(*rsp)) + return -EPROTO; + + dcid = __le16_to_cpu(rsp->dcid); + mtu = __le16_to_cpu(rsp->mtu); + mps = __le16_to_cpu(rsp->mps); + credits = __le16_to_cpu(rsp->credits); + result = __le16_to_cpu(rsp->result); + + if (result == L2CAP_CR_SUCCESS && (mtu < 23 || mps < 23)) + return -EPROTO; + + BT_DBG("dcid 0x%4.4x mtu %u mps %u credits %u result 0x%2.2x", + dcid, mtu, mps, credits, result); + + mutex_lock(&conn->chan_lock); + + chan = __l2cap_get_chan_by_ident(conn, cmd->ident); + if (!chan) { + err = -EBADSLT; + goto unlock; + } + + err = 0; + + l2cap_chan_lock(chan); + + switch (result) { + case L2CAP_CR_SUCCESS: + chan->ident = 0; + chan->dcid = dcid; + chan->omtu = mtu; + chan->remote_mps = mps; + chan->tx_credits = credits; + l2cap_chan_ready(chan); + break; + + default: + l2cap_chan_del(chan, ECONNREFUSED); + break; + } + + l2cap_chan_unlock(chan); + +unlock: + mutex_unlock(&conn->chan_lock); + + return err; +} + static inline int l2cap_bredr_sig_cmd(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data) @@ -5276,23 +5588,235 @@ static inline int l2cap_bredr_sig_cmd(struct l2cap_conn *conn, return err; } +static int l2cap_le_connect_req(struct l2cap_conn *conn, + struct l2cap_cmd_hdr *cmd, u16 cmd_len, + u8 *data) +{ + struct l2cap_le_conn_req *req = (struct l2cap_le_conn_req *) data; + struct l2cap_le_conn_rsp rsp; + struct l2cap_chan *chan, *pchan; + u16 dcid, scid, credits, mtu, mps; + __le16 psm; + u8 result; + + if (cmd_len != sizeof(*req)) + return -EPROTO; + + scid = __le16_to_cpu(req->scid); + mtu = __le16_to_cpu(req->mtu); + mps = __le16_to_cpu(req->mps); + psm = req->psm; + dcid = 0; + credits = 0; + + if (mtu < 23 || mps < 23) + return -EPROTO; + + BT_DBG("psm 0x%2.2x scid 0x%4.4x mtu %u mps %u", __le16_to_cpu(psm), 
+ scid, mtu, mps); + + /* Check if we have socket listening on psm */ + pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src, + &conn->hcon->dst, LE_LINK); + if (!pchan) { + result = L2CAP_CR_BAD_PSM; + chan = NULL; + goto response; + } + + mutex_lock(&conn->chan_lock); + l2cap_chan_lock(pchan); + + if (!smp_sufficient_security(conn->hcon, pchan->sec_level)) { + result = L2CAP_CR_AUTHENTICATION; + chan = NULL; + goto response_unlock; + } + + /* Check if we already have channel with that dcid */ + if (__l2cap_get_chan_by_dcid(conn, scid)) { + result = L2CAP_CR_NO_MEM; + chan = NULL; + goto response_unlock; + } + + chan = pchan->ops->new_connection(pchan); + if (!chan) { + result = L2CAP_CR_NO_MEM; + goto response_unlock; + } + + l2cap_le_flowctl_init(chan); + + bacpy(&chan->src, &conn->hcon->src); + bacpy(&chan->dst, &conn->hcon->dst); + chan->src_type = bdaddr_type(conn->hcon, conn->hcon->src_type); + chan->dst_type = bdaddr_type(conn->hcon, conn->hcon->dst_type); + chan->psm = psm; + chan->dcid = scid; + chan->omtu = mtu; + chan->remote_mps = mps; + chan->tx_credits = __le16_to_cpu(req->credits); + + __l2cap_chan_add(conn, chan); + dcid = chan->scid; + credits = chan->rx_credits; + + __set_chan_timer(chan, chan->ops->get_sndtimeo(chan)); + + chan->ident = cmd->ident; + + if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) { + l2cap_state_change(chan, BT_CONNECT2); + result = L2CAP_CR_PEND; + chan->ops->defer(chan); + } else { + l2cap_chan_ready(chan); + result = L2CAP_CR_SUCCESS; + } + +response_unlock: + l2cap_chan_unlock(pchan); + mutex_unlock(&conn->chan_lock); + + if (result == L2CAP_CR_PEND) + return 0; + +response: + if (chan) { + rsp.mtu = cpu_to_le16(chan->imtu); + rsp.mps = cpu_to_le16(chan->mps); + } else { + rsp.mtu = 0; + rsp.mps = 0; + } + + rsp.dcid = cpu_to_le16(dcid); + rsp.credits = cpu_to_le16(credits); + rsp.result = cpu_to_le16(result); + + l2cap_send_cmd(conn, cmd->ident, L2CAP_LE_CONN_RSP, sizeof(rsp), &rsp); + + return 0; +} + +static 
inline int l2cap_le_credits(struct l2cap_conn *conn, + struct l2cap_cmd_hdr *cmd, u16 cmd_len, + u8 *data) +{ + struct l2cap_le_credits *pkt; + struct l2cap_chan *chan; + u16 cid, credits; + + if (cmd_len != sizeof(*pkt)) + return -EPROTO; + + pkt = (struct l2cap_le_credits *) data; + cid = __le16_to_cpu(pkt->cid); + credits = __le16_to_cpu(pkt->credits); + + BT_DBG("cid 0x%4.4x credits 0x%4.4x", cid, credits); + + chan = l2cap_get_chan_by_dcid(conn, cid); + if (!chan) + return -EBADSLT; + + chan->tx_credits += credits; + + while (chan->tx_credits && !skb_queue_empty(&chan->tx_q)) { + l2cap_do_send(chan, skb_dequeue(&chan->tx_q)); + chan->tx_credits--; + } + + if (chan->tx_credits) + chan->ops->resume(chan); + + l2cap_chan_unlock(chan); + + return 0; +} + +static inline int l2cap_le_command_rej(struct l2cap_conn *conn, + struct l2cap_cmd_hdr *cmd, u16 cmd_len, + u8 *data) +{ + struct l2cap_cmd_rej_unk *rej = (struct l2cap_cmd_rej_unk *) data; + struct l2cap_chan *chan; + + if (cmd_len < sizeof(*rej)) + return -EPROTO; + + mutex_lock(&conn->chan_lock); + + chan = __l2cap_get_chan_by_ident(conn, cmd->ident); + if (!chan) + goto done; + + l2cap_chan_lock(chan); + l2cap_chan_del(chan, ECONNREFUSED); + l2cap_chan_unlock(chan); + +done: + mutex_unlock(&conn->chan_lock); + return 0; +} + static inline int l2cap_le_sig_cmd(struct l2cap_conn *conn, - struct l2cap_cmd_hdr *cmd, u8 *data) + struct l2cap_cmd_hdr *cmd, u16 cmd_len, + u8 *data) { + int err = 0; + + if (!enable_lecoc) { + switch (cmd->code) { + case L2CAP_LE_CONN_REQ: + case L2CAP_LE_CONN_RSP: + case L2CAP_LE_CREDITS: + case L2CAP_DISCONN_REQ: + case L2CAP_DISCONN_RSP: + return -EINVAL; + } + } + switch (cmd->code) { case L2CAP_COMMAND_REJ: - return 0; + l2cap_le_command_rej(conn, cmd, cmd_len, data); + break; case L2CAP_CONN_PARAM_UPDATE_REQ: - return l2cap_conn_param_update_req(conn, cmd, data); + err = l2cap_conn_param_update_req(conn, cmd, cmd_len, data); + break; case L2CAP_CONN_PARAM_UPDATE_RSP: - return 0; 
+ break; + + case L2CAP_LE_CONN_RSP: + l2cap_le_connect_rsp(conn, cmd, cmd_len, data); + break; + + case L2CAP_LE_CONN_REQ: + err = l2cap_le_connect_req(conn, cmd, cmd_len, data); + break; + + case L2CAP_LE_CREDITS: + err = l2cap_le_credits(conn, cmd, cmd_len, data); + break; + + case L2CAP_DISCONN_REQ: + err = l2cap_disconnect_req(conn, cmd, cmd_len, data); + break; + + case L2CAP_DISCONN_RSP: + l2cap_disconnect_rsp(conn, cmd, cmd_len, data); + break; default: BT_ERR("Unknown LE signaling command 0x%2.2x", cmd->code); - return -EINVAL; + err = -EINVAL; + break; } + + return err; } static inline void l2cap_le_sig_channel(struct l2cap_conn *conn, @@ -5321,7 +5845,7 @@ static inline void l2cap_le_sig_channel(struct l2cap_conn *conn, goto drop; } - err = l2cap_le_sig_cmd(conn, cmd, skb->data); + err = l2cap_le_sig_cmd(conn, cmd, len, skb->data); if (err) { struct l2cap_cmd_rej_unk rej; @@ -6312,6 +6836,121 @@ drop: return 0; } +static void l2cap_chan_le_send_credits(struct l2cap_chan *chan) +{ + struct l2cap_conn *conn = chan->conn; + struct l2cap_le_credits pkt; + u16 return_credits; + + /* We return more credits to the sender only after the amount of + * credits falls below half of the initial amount. 
+ */ + if (chan->rx_credits >= (le_max_credits + 1) / 2) + return; + + return_credits = le_max_credits - chan->rx_credits; + + BT_DBG("chan %p returning %u credits to sender", chan, return_credits); + + chan->rx_credits += return_credits; + + pkt.cid = cpu_to_le16(chan->scid); + pkt.credits = cpu_to_le16(return_credits); + + chan->ident = l2cap_get_ident(conn); + + l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CREDITS, sizeof(pkt), &pkt); +} + +static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) +{ + int err; + + if (!chan->rx_credits) { + BT_ERR("No credits to receive LE L2CAP data"); + return -ENOBUFS; + } + + if (chan->imtu < skb->len) { + BT_ERR("Too big LE L2CAP PDU"); + return -ENOBUFS; + } + + chan->rx_credits--; + BT_DBG("rx_credits %u -> %u", chan->rx_credits + 1, chan->rx_credits); + + l2cap_chan_le_send_credits(chan); + + err = 0; + + if (!chan->sdu) { + u16 sdu_len; + + sdu_len = get_unaligned_le16(skb->data); + skb_pull(skb, L2CAP_SDULEN_SIZE); + + BT_DBG("Start of new SDU. sdu_len %u skb->len %u imtu %u", + sdu_len, skb->len, chan->imtu); + + if (sdu_len > chan->imtu) { + BT_ERR("Too big LE L2CAP SDU length received"); + err = -EMSGSIZE; + goto failed; + } + + if (skb->len > sdu_len) { + BT_ERR("Too much LE L2CAP data received"); + err = -EINVAL; + goto failed; + } + + if (skb->len == sdu_len) + return chan->ops->recv(chan, skb); + + chan->sdu = skb; + chan->sdu_len = sdu_len; + chan->sdu_last_frag = skb; + + return 0; + } + + BT_DBG("SDU fragment. 
chan->sdu->len %u skb->len %u chan->sdu_len %u", + chan->sdu->len, skb->len, chan->sdu_len); + + if (chan->sdu->len + skb->len > chan->sdu_len) { + BT_ERR("Too much LE L2CAP data received"); + err = -EINVAL; + goto failed; + } + + append_skb_frag(chan->sdu, skb, &chan->sdu_last_frag); + skb = NULL; + + if (chan->sdu->len == chan->sdu_len) { + err = chan->ops->recv(chan, chan->sdu); + if (!err) { + chan->sdu = NULL; + chan->sdu_last_frag = NULL; + chan->sdu_len = 0; + } + } + +failed: + if (err) { + kfree_skb(skb); + kfree_skb(chan->sdu); + chan->sdu = NULL; + chan->sdu_last_frag = NULL; + chan->sdu_len = 0; + } + + /* We can't return an error here since we took care of the skb + * freeing internally. An error return would cause the caller to + * do a double-free of the skb. + */ + return 0; +} + static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid, struct sk_buff *skb) { @@ -6341,6 +6980,12 @@ static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid, goto drop; switch (chan->mode) { + case L2CAP_MODE_LE_FLOWCTL: + if (l2cap_le_data_rcv(chan, skb) < 0) + goto drop; + + goto done; + case L2CAP_MODE_BASIC: /* If socket recv buffers overflows we drop data here * which is *bad* because L2CAP has to be reliable. 
@@ -6380,7 +7025,8 @@ static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm, if (hcon->type != ACL_LINK) goto drop; - chan = l2cap_global_chan_by_psm(0, psm, &hcon->src, &hcon->dst); + chan = l2cap_global_chan_by_psm(0, psm, &hcon->src, &hcon->dst, + ACL_LINK); if (!chan) goto drop; @@ -6612,11 +7258,10 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) } if (chan->state == BT_CONNECT) { - if (!status) { + if (!status) l2cap_start_connection(chan); - } else { + else __set_chan_timer(chan, L2CAP_DISC_TIMEOUT); - } } else if (chan->state == BT_CONNECT2) { struct l2cap_conn_rsp rsp; __u16 res, stat; @@ -6817,6 +7462,11 @@ int __init l2cap_init(void) l2cap_debugfs = debugfs_create_file("l2cap", 0444, bt_debugfs, NULL, &l2cap_debugfs_fops); + debugfs_create_u16("l2cap_le_max_credits", 0466, bt_debugfs, + &le_max_credits); + debugfs_create_u16("l2cap_le_default_mps", 0466, bt_debugfs, + &le_default_mps); + return 0; } diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 7cc24d263caa..e7806e6d282c 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -27,6 +27,7 @@ /* Bluetooth L2CAP sockets. 
*/ +#include <linux/module.h> #include <linux/export.h> #include <net/bluetooth/bluetooth.h> @@ -35,6 +36,8 @@ #include "smp.h" +bool enable_lecoc; + static struct bt_sock_list l2cap_sk_list = { .lock = __RW_LOCK_UNLOCKED(l2cap_sk_list.lock) }; @@ -50,6 +53,32 @@ bool l2cap_is_socket(struct socket *sock) } EXPORT_SYMBOL(l2cap_is_socket); +static int l2cap_validate_bredr_psm(u16 psm) +{ + /* PSM must be odd and lsb of upper byte must be 0 */ + if ((psm & 0x0101) != 0x0001) + return -EINVAL; + + /* Restrict usage of well-known PSMs */ + if (psm < 0x1001 && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + return 0; +} + +static int l2cap_validate_le_psm(u16 psm) +{ + /* Valid LE_PSM ranges are defined only until 0x00ff */ + if (psm > 0x00ff) + return -EINVAL; + + /* Restrict fixed, SIG assigned PSM values to CAP_NET_BIND_SERVICE */ + if (psm <= 0x007f && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + return 0; +} + static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) { struct sock *sk = sock->sk; @@ -73,11 +102,11 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) return -EINVAL; if (bdaddr_type_is_le(la.l2_bdaddr_type)) { - /* Connection oriented channels are not supported on LE */ - if (la.l2_psm) + if (!enable_lecoc && la.l2_psm) return -EINVAL; /* We only allow ATT user space socket */ - if (la.l2_cid != __constant_cpu_to_le16(L2CAP_CID_ATT)) + if (la.l2_cid && + la.l2_cid != __constant_cpu_to_le16(L2CAP_CID_ATT)) return -EINVAL; } @@ -91,17 +120,13 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) if (la.l2_psm) { __u16 psm = __le16_to_cpu(la.l2_psm); - /* PSM must be odd and lsb of upper byte must be 0 */ - if ((psm & 0x0101) != 0x0001) { - err = -EINVAL; - goto done; - } + if (la.l2_bdaddr_type == BDADDR_BREDR) + err = l2cap_validate_bredr_psm(psm); + else + err = l2cap_validate_le_psm(psm); - /* Restrict usage of well-known PSMs */ - if (psm < 0x1001 && 
!capable(CAP_NET_BIND_SERVICE)) { - err = -EACCES; + if (err) goto done; - } } if (la.l2_cid) @@ -127,6 +152,9 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) bacpy(&chan->src, &la.l2_bdaddr); chan->src_type = la.l2_bdaddr_type; + if (chan->psm && bdaddr_type_is_le(chan->src_type)) + chan->mode = L2CAP_MODE_LE_FLOWCTL; + chan->state = BT_BOUND; sk->sk_state = BT_BOUND; @@ -189,14 +217,17 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, return -EINVAL; if (bdaddr_type_is_le(la.l2_bdaddr_type)) { - /* Connection oriented channels are not supported on LE */ - if (la.l2_psm) + if (!enable_lecoc && la.l2_psm) return -EINVAL; /* We only allow ATT user space socket */ - if (la.l2_cid != __constant_cpu_to_le16(L2CAP_CID_ATT)) + if (la.l2_cid && + la.l2_cid != __constant_cpu_to_le16(L2CAP_CID_ATT)) return -EINVAL; } + if (chan->psm && bdaddr_type_is_le(chan->src_type)) + chan->mode = L2CAP_MODE_LE_FLOWCTL; + err = l2cap_chan_connect(chan, la.l2_psm, __le16_to_cpu(la.l2_cid), &la.l2_bdaddr, la.l2_bdaddr_type); if (err) @@ -234,6 +265,7 @@ static int l2cap_sock_listen(struct socket *sock, int backlog) switch (chan->mode) { case L2CAP_MODE_BASIC: + case L2CAP_MODE_LE_FLOWCTL: break; case L2CAP_MODE_ERTM: case L2CAP_MODE_STREAMING: @@ -360,6 +392,16 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, switch (optname) { case L2CAP_OPTIONS: + /* LE sockets should use BT_SNDMTU/BT_RCVMTU, but since + * legacy ATT code depends on getsockopt for + * L2CAP_OPTIONS we need to let this pass. 
+ */ + if (bdaddr_type_is_le(chan->src_type) && + chan->scid != L2CAP_CID_ATT) { + err = -EINVAL; + break; + } + memset(&opts, 0, sizeof(opts)); opts.imtu = chan->imtu; opts.omtu = chan->omtu; @@ -514,6 +556,41 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, err = -EFAULT; break; + case BT_SNDMTU: + if (!enable_lecoc) { + err = -EPROTONOSUPPORT; + break; + } + + if (!bdaddr_type_is_le(chan->src_type)) { + err = -EINVAL; + break; + } + + if (sk->sk_state != BT_CONNECTED) { + err = -ENOTCONN; + break; + } + + if (put_user(chan->omtu, (u16 __user *) optval)) + err = -EFAULT; + break; + + case BT_RCVMTU: + if (!enable_lecoc) { + err = -EPROTONOSUPPORT; + break; + } + + if (!bdaddr_type_is_le(chan->src_type)) { + err = -EINVAL; + break; + } + + if (put_user(chan->imtu, (u16 __user *) optval)) + err = -EFAULT; + break; + default: err = -ENOPROTOOPT; break; @@ -554,6 +631,11 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, switch (optname) { case L2CAP_OPTIONS: + if (bdaddr_type_is_le(chan->src_type)) { + err = -EINVAL; + break; + } + if (sk->sk_state == BT_CONNECTED) { err = -EINVAL; break; @@ -585,6 +667,8 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, chan->mode = opts.mode; switch (chan->mode) { + case L2CAP_MODE_LE_FLOWCTL: + break; case L2CAP_MODE_BASIC: clear_bit(CONF_STATE2_DEVICE, &chan->conf_state); break; @@ -807,6 +891,47 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; + case BT_SNDMTU: + if (!enable_lecoc) { + err = -EPROTONOSUPPORT; + break; + } + + if (!bdaddr_type_is_le(chan->src_type)) { + err = -EINVAL; + break; + } + + /* Setting is not supported as it's the remote side that + * decides this. 
+ */ + err = -EPERM; + break; + + case BT_RCVMTU: + if (!enable_lecoc) { + err = -EPROTONOSUPPORT; + break; + } + + if (!bdaddr_type_is_le(chan->src_type)) { + err = -EINVAL; + break; + } + + if (sk->sk_state == BT_CONNECTED) { + err = -EISCONN; + break; + } + + if (get_user(opt, (u32 __user *) optval)) { + err = -EFAULT; + break; + } + + chan->imtu = opt; + break; + default: err = -ENOPROTOOPT; break; @@ -859,10 +984,16 @@ static int l2cap_sock_recvmsg(struct kiocb *iocb, struct socket *sock, if (sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { - sk->sk_state = BT_CONFIG; - pi->chan->state = BT_CONFIG; + if (bdaddr_type_is_le(pi->chan->src_type)) { + sk->sk_state = BT_CONNECTED; + pi->chan->state = BT_CONNECTED; + __l2cap_le_connect_rsp_defer(pi->chan); + } else { + sk->sk_state = BT_CONFIG; + pi->chan->state = BT_CONFIG; + __l2cap_connect_rsp_defer(pi->chan); + } - __l2cap_connect_rsp_defer(pi->chan); err = 0; goto done; } @@ -1236,6 +1367,14 @@ static long l2cap_sock_get_sndtimeo_cb(struct l2cap_chan *chan) return sk->sk_sndtimeo; } +static void l2cap_sock_suspend_cb(struct l2cap_chan *chan) +{ + struct sock *sk = chan->data; + + set_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags); + sk->sk_state_change(sk); +} + static struct l2cap_ops l2cap_chan_ops = { .name = "L2CAP Socket Interface", .new_connection = l2cap_sock_new_connection_cb, @@ -1246,6 +1385,7 @@ static struct l2cap_ops l2cap_chan_ops = { .ready = l2cap_sock_ready_cb, .defer = l2cap_sock_defer_cb, .resume = l2cap_sock_resume_cb, + .suspend = l2cap_sock_suspend_cb, .set_shutdown = l2cap_sock_set_shutdown_cb, .get_sndtimeo = l2cap_sock_get_sndtimeo_cb, .alloc_skb = l2cap_sock_alloc_skb_cb, @@ -1303,6 +1443,8 @@ static void l2cap_sock_init(struct sock *sk, struct sock *parent) chan->tx_win_max = pchan->tx_win_max; chan->sec_level = pchan->sec_level; chan->flags = pchan->flags; + chan->tx_credits = pchan->tx_credits; + chan->rx_credits = pchan->rx_credits; security_sk_clone(parent, 
sk); } else { @@ -1469,3 +1611,6 @@ void l2cap_cleanup_sockets(void) bt_sock_unregister(BTPROTO_L2CAP); proto_unregister(&l2cap_proto); } + +module_param(enable_lecoc, bool, 0644); +MODULE_PARM_DESC(enable_lecoc, "Enable support for LE CoC"); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 074d83690a41..a03ca3ca91bf 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1264,7 +1264,7 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, if (cp->val == 0x02) { /* Limited discoverable mode */ - hci_cp.num_iac = 2; + hci_cp.num_iac = min_t(u8, hdev->num_iac, 2); hci_cp.iac_lap[0] = 0x00; /* LIAC */ hci_cp.iac_lap[1] = 0x8b; hci_cp.iac_lap[2] = 0x9e; @@ -4595,6 +4595,9 @@ void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, struct mgmt_ev_device_disconnected ev; struct sock *sk = NULL; + if (link_type != ACL_LINK && link_type != LE_LINK) + return; + mgmt_pending_foreach(MGMT_OP_DISCONNECT, hdev, disconnect_rsp, &sk); bacpy(&ev.addr.bdaddr, bdaddr); @@ -4613,6 +4616,8 @@ void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status) { + u8 bdaddr_type = link_to_bdaddr(link_type, addr_type); + struct mgmt_cp_disconnect *cp; struct mgmt_rp_disconnect rp; struct pending_cmd *cmd; @@ -4623,8 +4628,16 @@ void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, if (!cmd) return; + cp = cmd->param; + + if (bacmp(bdaddr, &cp->addr.bdaddr)) + return; + + if (cp->addr.type != bdaddr_type) + return; + bacpy(&rp.addr.bdaddr, bdaddr); - rp.addr.type = link_to_bdaddr(link_type, addr_type); + rp.addr.type = bdaddr_type; cmd_complete(cmd->sk, cmd->index, MGMT_OP_DISCONNECT, mgmt_status(status), &rp, sizeof(rp)); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 4b07acb8293c..45007362683b 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -53,8 +53,7 @@ static int 
smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) { struct blkcipher_desc desc; struct scatterlist sg; - int err, iv_len; - unsigned char iv[128]; + int err; if (tfm == NULL) { BT_ERR("tfm %p", tfm); @@ -72,12 +71,6 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) sg_init_one(&sg, r, 16); - iv_len = crypto_blkcipher_ivsize(tfm); - if (iv_len) { - memset(&iv, 0xff, iv_len); - crypto_blkcipher_set_iv(tfm, iv, iv_len); - } - err = crypto_blkcipher_encrypt(&desc, &sg, &sg, 16); if (err) BT_ERR("Encrypt data error %d", err); @@ -143,13 +136,6 @@ static int smp_s1(struct crypto_blkcipher *tfm, u8 k[16], u8 r1[16], return err; } -static int smp_rand(u8 *buf) -{ - get_random_bytes(buf, 16); - - return 0; -} - static struct sk_buff *smp_build_cmd(struct l2cap_conn *conn, u8 code, u16 dlen, void *data) { @@ -257,11 +243,11 @@ static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) return 0; } -static void smp_failure(struct l2cap_conn *conn, u8 reason, u8 send) +static void smp_failure(struct l2cap_conn *conn, u8 reason) { struct hci_conn *hcon = conn->hcon; - if (send) + if (reason) smp_send_cmd(conn, SMP_CMD_PAIRING_FAIL, sizeof(reason), &reason); @@ -406,7 +392,7 @@ static void confirm_work(struct work_struct *work) return; error: - smp_failure(conn, reason, 1); + smp_failure(conn, reason); } static void random_work(struct work_struct *work) @@ -490,7 +476,7 @@ static void random_work(struct work_struct *work) return; error: - smp_failure(conn, reason, 1); + smp_failure(conn, reason); } static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) @@ -555,10 +541,10 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) break; case MGMT_OP_USER_PASSKEY_NEG_REPLY: case MGMT_OP_USER_CONFIRM_NEG_REPLY: - smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED, 1); + smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED); return 0; default: - smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED, 1); + smp_failure(conn, 
SMP_PASSKEY_ENTRY_FAILED); return -EOPNOTSUPP; } @@ -606,9 +592,7 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) if (check_enc_key_size(conn, key_size)) return SMP_ENC_KEY_SIZE; - ret = smp_rand(smp->prnd); - if (ret) - return SMP_UNSPECIFIED; + get_random_bytes(smp->prnd, sizeof(smp->prnd)); smp->prsp[0] = SMP_CMD_PAIRING_RSP; memcpy(&smp->prsp[1], &rsp, sizeof(rsp)); @@ -644,9 +628,7 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) if (check_enc_key_size(conn, key_size)) return SMP_ENC_KEY_SIZE; - ret = smp_rand(smp->prnd); - if (ret) - return SMP_UNSPECIFIED; + get_random_bytes(smp->prnd, sizeof(smp->prnd)); smp->prsp[0] = SMP_CMD_PAIRING_RSP; memcpy(&smp->prsp[1], rsp, sizeof(*rsp)); @@ -768,6 +750,17 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) return 0; } +bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level) +{ + if (sec_level == BT_SECURITY_LOW) + return true; + + if (hcon->sec_level >= sec_level) + return true; + + return false; +} + int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) { struct l2cap_conn *conn = hcon->l2cap_data; @@ -779,10 +772,7 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) if (!test_bit(HCI_LE_ENABLED, &hcon->hdev->dev_flags)) return 1; - if (sec_level == BT_SECURITY_LOW) - return 1; - - if (hcon->sec_level >= sec_level) + if (smp_sufficient_security(hcon, sec_level)) return 1; if (hcon->link_mode & HCI_LM_MASTER) @@ -895,7 +885,7 @@ int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb) break; case SMP_CMD_PAIRING_FAIL: - smp_failure(conn, skb->data[0], 0); + smp_failure(conn, 0); reason = 0; err = -EPERM; break; @@ -941,7 +931,7 @@ int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb) done: if (reason) - smp_failure(conn, reason, 1); + smp_failure(conn, reason); kfree_skb(skb); return err; diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index f8ba07f3e5fa..a700bcb490d7 
100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -136,6 +136,7 @@ struct smp_chan { }; /* SMP Commands */ +bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level); int smp_conn_security(struct hci_conn *hcon, __u8 sec_level); int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb); int smp_distribute_keys(struct l2cap_conn *conn, __u8 force); diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 4b81b1471789..96866aaf3f71 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -26,7 +26,7 @@ static int deliver_clone(const struct net_bridge_port *prev, void (*__packet_hook)(const struct net_bridge_port *p, struct sk_buff *skb)); -/* Don't forward packets to originating port or forwarding diasabled */ +/* Don't forward packets to originating port or forwarding disabled */ static inline int should_deliver(const struct net_bridge_port *p, const struct sk_buff *skb) { diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 4bf02adb5dc2..1f6bd1e2e8a4 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -61,7 +61,7 @@ static int port_cost(struct net_device *dev) } -/* Check for port carrier transistions. */ +/* Check for port carrier transitions. */ void br_port_carrier_check(struct net_bridge_port *p) { struct net_device *dev = p->dev; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index af5ebd18d705..7ffc801467ec 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -146,7 +146,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, /* At this point, we know that the frame was filtered and contains * a valid vlan id. If the vlan id is set in the untagged bitmap, - * send untagged; otherwise, send taged. + * send untagged; otherwise, send tagged. 
*/ br_vlan_get_tag(skb, &vid); if (test_bit(vid, pv->untagged_bitmap)) diff --git a/net/core/dev.c b/net/core/dev.c index 9d4369ece679..c482fe8abf87 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3006,7 +3006,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } skb_reset_network_header(skb); - if (!skb_get_rxhash(skb)) + if (!skb_get_hash(skb)) goto done; flow_table = rcu_dereference(rxqueue->rps_flow_table); @@ -3151,7 +3151,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { - new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1); + new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index ec40a849fc42..bb504a919e33 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -186,47 +186,6 @@ static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, return err; } -int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, unsigned char addr_type) -{ - int err; - struct netdev_hw_addr *ha, *ha2; - unsigned char type; - - list_for_each_entry(ha, &from_list->list, list) { - type = addr_type ? addr_type : ha->type; - err = __hw_addr_add(to_list, ha->addr, addr_len, type); - if (err) - goto unroll; - } - return 0; - -unroll: - list_for_each_entry(ha2, &from_list->list, list) { - if (ha2 == ha) - break; - type = addr_type ? 
addr_type : ha2->type; - __hw_addr_del(to_list, ha2->addr, addr_len, type); - } - return err; -} -EXPORT_SYMBOL(__hw_addr_add_multiple); - -void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, unsigned char addr_type) -{ - struct netdev_hw_addr *ha; - unsigned char type; - - list_for_each_entry(ha, &from_list->list, list) { - type = addr_type ? addr_type : ha->type; - __hw_addr_del(to_list, ha->addr, addr_len, type); - } -} -EXPORT_SYMBOL(__hw_addr_del_multiple); - /* This function only works where there is a strict 1-1 relationship * between source and destionation of they synch. If you ever need to * sync addresses to more then 1 destination, you need to use @@ -264,7 +223,7 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, } EXPORT_SYMBOL(__hw_addr_unsync); -void __hw_addr_flush(struct netdev_hw_addr_list *list) +static void __hw_addr_flush(struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha, *tmp; @@ -274,7 +233,6 @@ void __hw_addr_flush(struct netdev_hw_addr_list *list) } list->count = 0; } -EXPORT_SYMBOL(__hw_addr_flush); void __hw_addr_init(struct netdev_hw_addr_list *list) { @@ -400,59 +358,6 @@ int dev_addr_del(struct net_device *dev, const unsigned char *addr, } EXPORT_SYMBOL(dev_addr_del); -/** - * dev_addr_add_multiple - Add device addresses from another device - * @to_dev: device to which addresses will be added - * @from_dev: device from which addresses will be added - * @addr_type: address type - 0 means type will be used from from_dev - * - * Add device addresses of the one device to another. - ** - * The caller must hold the rtnl_mutex. 
- */ -int dev_addr_add_multiple(struct net_device *to_dev, - struct net_device *from_dev, - unsigned char addr_type) -{ - int err; - - ASSERT_RTNL(); - - if (from_dev->addr_len != to_dev->addr_len) - return -EINVAL; - err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, - to_dev->addr_len, addr_type); - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); - return err; -} -EXPORT_SYMBOL(dev_addr_add_multiple); - -/** - * dev_addr_del_multiple - Delete device addresses by another device - * @to_dev: device where the addresses will be deleted - * @from_dev: device supplying the addresses to be deleted - * @addr_type: address type - 0 means type will be used from from_dev - * - * Deletes addresses in to device by the list of addresses in from device. - * - * The caller must hold the rtnl_mutex. - */ -int dev_addr_del_multiple(struct net_device *to_dev, - struct net_device *from_dev, - unsigned char addr_type) -{ - ASSERT_RTNL(); - - if (from_dev->addr_len != to_dev->addr_len) - return -EINVAL; - __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, - to_dev->addr_len, addr_type); - call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); - return 0; -} -EXPORT_SYMBOL(dev_addr_del_multiple); - /* * Unicast list handling functions */ diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 95897183226e..e70301eb7a4a 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -64,7 +64,6 @@ static struct genl_family net_drop_monitor_family = { .hdrsize = 0, .name = "NET_DM", .version = 2, - .maxattr = NET_DM_CMD_MAX, }; static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d6ef17322500..b324bfa3485c 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -202,12 +202,12 @@ static __always_inline u32 __flow_hash_1word(u32 a) } /* - * __skb_get_rxhash: calculate a flow hash based on src/dst addresses + * __skb_get_hash: 
calculate a flow hash based on src/dst addresses * and src/dst port numbers. Sets rxhash in skb to non-zero hash value * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb * if hash is a canonical 4-tuple hash over transport ports. */ -void __skb_get_rxhash(struct sk_buff *skb) +void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; u32 hash; @@ -234,7 +234,7 @@ void __skb_get_rxhash(struct sk_buff *skb) skb->rxhash = hash; } -EXPORT_SYMBOL(__skb_get_rxhash); +EXPORT_SYMBOL(__skb_get_hash); /* * Returns a Tx hash based on the given packet descriptor a Tx queues' number diff --git a/net/core/neighbour.c b/net/core/neighbour.c index bf6f404c04aa..a666740051dc 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1169,6 +1169,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, neigh->parms->reachable_time : 0))); neigh->nud_state = new; + notify = 1; } if (lladdr != neigh->ha) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 06e72d3cdf60..2b6b863f51f2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -712,9 +712,8 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->inner_network_header = old->inner_network_header; new->inner_mac_header = old->inner_mac_header; skb_dst_copy(new, old); - new->rxhash = old->rxhash; + skb_copy_hash(new, old); new->ooo_okay = old->ooo_okay; - new->l4_rxhash = old->l4_rxhash; new->no_fcs = old->no_fcs; new->encapsulation = old->encapsulation; #ifdef CONFIG_XFRM diff --git a/net/core/sock.c b/net/core/sock.c index ab20ed9b0f31..5393b4b719d7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -882,7 +882,7 @@ set_rcvbuf: case SO_PEEK_OFF: if (sock->ops->set_peek_off) - sock->ops->set_peek_off(sk, val); + ret = sock->ops->set_peek_off(sk, val); else ret = -EOPNOTSUPP; break; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4ac71ff7c2e4..629019e6f8e9 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -141,6 +141,9 @@ static 
void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == ICMPV6_PKT_TOOBIG) { struct dst_entry *dst = NULL; + if (!ip6_sk_accept_pmtu(sk)) + goto out; + if (sock_owned_by_user(sk)) goto out; if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED)) @@ -851,7 +854,6 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - usin->sin6_addr = flowlabel->dst; fl6_sock_release(flowlabel); } } diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 4bdab1521878..327060c6c874 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -127,11 +127,6 @@ int hsr_create_self_node(struct list_head *self_node_db, return 0; } -static void node_entry_reclaim(struct rcu_head *rh) -{ - kfree(container_of(rh, struct node_entry, rcu_head)); -} - /* Add/merge node to the database of nodes. 'skb' must contain an HSR * supervision frame. @@ -175,7 +170,7 @@ struct node_entry *hsr_merge_node(struct hsr_priv *hsr_priv, if (node && !ether_addr_equal(node->MacAddressA, hsr_sp->MacAddressA)) { /* Node has changed its AddrA, frame was received from SlaveB */ list_del_rcu(&node->mac_list); - call_rcu(&node->rcu_head, node_entry_reclaim); + kfree_rcu(node, rcu_head); node = NULL; } @@ -183,7 +178,7 @@ struct node_entry *hsr_merge_node(struct hsr_priv *hsr_priv, !ether_addr_equal(node->MacAddressB, hsr_ethsup->ethhdr.h_source)) { /* Cables have been swapped */ list_del_rcu(&node->mac_list); - call_rcu(&node->rcu_head, node_entry_reclaim); + kfree_rcu(node, rcu_head); node = NULL; } @@ -192,7 +187,7 @@ struct node_entry *hsr_merge_node(struct hsr_priv *hsr_priv, !ether_addr_equal(node->MacAddressA, hsr_ethsup->ethhdr.h_source)) { /* Cables have been swapped */ list_del_rcu(&node->mac_list); - call_rcu(&node->rcu_head, node_entry_reclaim); + kfree_rcu(node, rcu_head); node = NULL; } @@ -417,7 +412,7 @@ void hsr_prune_nodes(struct hsr_priv *hsr_priv) 
hsr_nl_nodedown(hsr_priv, node->MacAddressA); list_del_rcu(&node->mac_list); /* Note that we need to free this entry later: */ - call_rcu(&node->rcu_head, node_entry_reclaim); + kfree_rcu(node, rcu_head); } } rcu_read_unlock(); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ef4f9df6d698..6b1193e63911 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -126,9 +126,6 @@ static struct list_head inetsw[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw_lock); -struct ipv4_config ipv4_config; -EXPORT_SYMBOL(ipv4_config); - /* New destruction routine */ void inet_sock_destruct(struct sock *sk) @@ -342,7 +339,7 @@ lookup_protocol: inet->hdrincl = 1; } - if (ipv4_config.no_pmtu_disc) + if (net->ipv4.sysctl_ip_no_pmtu_disc) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 523be38e37de..f2e15738534d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -104,7 +104,10 @@ errout: static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) { struct fib_result *result = (struct fib_result *) arg->result; - struct net_device *dev = result->fi->fib_dev; + struct net_device *dev = NULL; + + if (result->fi) + dev = result->fi->fib_dev; /* do not accept result if the route does * not meet the required prefix length diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5c0e8bc6e5ba..fb3c5637199d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -705,7 +705,9 @@ static void icmp_unreach(struct sk_buff *skb) case ICMP_PORT_UNREACH: break; case ICMP_FRAG_NEEDED: - if (ipv4_config.no_pmtu_disc) { + if (net->ipv4.sysctl_ip_no_pmtu_disc == 2) { + goto out; + } else if (net->ipv4.sysctl_ip_no_pmtu_disc) { LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), &iph->daddr); } else { diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 2481993a4970..c10a3ce5cbff 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c 
@@ -704,7 +704,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); if (ip_defrag(skb, user)) return NULL; - skb->rxhash = 0; + skb_clear_hash(skb); } } return skb; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 42ffbc8d65c6..6156f4ef5e91 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -56,7 +56,7 @@ int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, skb_scrub_packet(skb, xnet); - skb->rxhash = 0; + skb_clear_hash(skb); skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); @@ -107,8 +107,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) nf_reset(skb); secpath_reset(skb); - if (!skb->l4_rxhash) - skb->rxhash = 0; + skb_clear_hash_if_not_l4(skb); skb_dst_drop(skb); skb->vlan_tci = 0; skb_set_queue_mapping(skb, 0); diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index f13bd91d9a56..a313c3fbeb46 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -423,6 +423,7 @@ static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) static struct xt_target synproxy_tg4_reg __read_mostly = { .name = "SYNPROXY", .family = NFPROTO_IPV4, + .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD), .target = synproxy_tg4, .targetsize = sizeof(struct xt_synproxy_info), .checkentry = synproxy_tg4_check, diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index fff5ba1a33b7..4a5e94ac314a 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -72,7 +72,7 @@ static int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_reject *priv = nft_expr_priv(expr); - if (nla_put_be32(skb, NFTA_REJECT_TYPE, priv->type)) + if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) goto nla_put_failure; switch (priv->type) { diff --git 
a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 38c8ec90ff68..d7b63a614454 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -286,13 +286,6 @@ static struct ctl_table ipv4_table[] = { .extra2 = &ip_ttl_max, }, { - .procname = "ip_no_pmtu_disc", - .data = &ipv4_config.no_pmtu_disc, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "ip_nonlocal_bind", .data = &sysctl_ip_nonlocal_bind, .maxlen = sizeof(int), @@ -831,6 +824,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = ipv4_local_port_range, }, + { + .procname = "ip_no_pmtu_disc", + .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2a69f42e51ca..9e7aec7ee67e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1384,23 +1384,51 @@ static void tcp_cwnd_validate(struct sock *sk) } } -/* Returns the portion of skb which can be sent right away without - * introducing MSS oddities to segment boundaries. In rare cases where - * mss_now != mss_cache, we will request caller to create a small skb - * per input skb which could be mostly avoided here (if desired). - * - * We explicitly want to create a request for splitting write queue tail - * to a small skb for Nagle purposes while avoiding unnecessary modulos, - * thus all the complexity (cwnd_len is always MSS multiple which we - * return whenever allowed by the other factors). Basically we need the - * modulo only when the receiver window alone is the limiting factor or - * when we would be allowed to send the split-due-to-Nagle skb fully. +/* Minshall's variant of the Nagle send check. 
*/ +static bool tcp_minshall_check(const struct tcp_sock *tp) +{ + return after(tp->snd_sml, tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +/* Update snd_sml if this skb is under mss + * Note that a TSO packet might end with a sub-mss segment + * The test is really : + * if ((skb->len % mss) != 0) + * tp->snd_sml = TCP_SKB_CB(skb)->end_seq; + * But we can avoid doing the divide again given we already have + * skb_pcount = skb->len / mss_now */ -static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, - unsigned int mss_now, unsigned int max_segs) +static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, + const struct sk_buff *skb) +{ + if (skb->len < tcp_skb_pcount(skb) * mss_now) + tp->snd_sml = TCP_SKB_CB(skb)->end_seq; +} + +/* Return false, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. (provided by caller in %partial bool) + * 2. Or it contains FIN. (already checked by caller) + * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. 
+ */ +static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, + unsigned int mss_now, int nonagle) +{ + return partial && + ((nonagle & TCP_NAGLE_CORK) || + (!nonagle && tp->packets_out && tcp_minshall_check(tp))); +} +/* Returns the portion of skb which can be sent right away */ +static unsigned int tcp_mss_split_point(const struct sock *sk, + const struct sk_buff *skb, + unsigned int mss_now, + unsigned int max_segs, + int nonagle) { const struct tcp_sock *tp = tcp_sk(sk); - u32 needed, window, max_len; + u32 partial, needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; max_len = mss_now * max_segs; @@ -1413,7 +1441,15 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_b if (max_len <= needed) return max_len; - return needed - needed % mss_now; + partial = needed % mss_now; + /* If last segment is not a full MSS, check if Nagle rules allow us + * to include this last segment in this skb. + * Otherwise, we'll split the skb at last MSS boundary + */ + if (tcp_nagle_check(partial != 0, tp, mss_now, nonagle)) + return needed - partial; + + return needed; } /* Can at least one segment of SKB be sent right now, according to the @@ -1453,28 +1489,6 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, return tso_segs; } -/* Minshall's variant of the Nagle send check. */ -static inline bool tcp_minshall_check(const struct tcp_sock *tp) -{ - return after(tp->snd_sml, tp->snd_una) && - !after(tp->snd_sml, tp->snd_nxt); -} - -/* Return false, if packet can be sent now without violation Nagle's rules: - * 1. It is full sized. - * 2. Or it contains FIN. (already checked by caller) - * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. - * 4. Or TCP_CORK is not set, and all sent packets are ACKed. - * With Minshall's modification: all sent small packets are ACKed. 
- */ -static inline bool tcp_nagle_check(const struct tcp_sock *tp, - const struct sk_buff *skb, - unsigned int mss_now, int nonagle) -{ - return skb->len < mss_now && - ((nonagle & TCP_NAGLE_CORK) || - (!nonagle && tp->packets_out && tcp_minshall_check(tp))); -} /* Return true if the Nagle test allows this packet to be * sent now. @@ -1495,7 +1509,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) return true; - if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) + if (!tcp_nagle_check(skb->len < cur_mss, tp, cur_mss, nonagle)) return true; return false; @@ -1898,7 +1912,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, limit = tcp_mss_split_point(sk, skb, mss_now, min_t(unsigned int, cwnd_quota, - sk->sk_gso_max_segs)); + sk->sk_gso_max_segs), + nonagle); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 44f6a20fa29d..f140048334ce 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -560,15 +560,11 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport, struct udp_table *udptable) { - struct sock *sk; const struct iphdr *iph = ip_hdr(skb); - if (unlikely(sk = skb_steal_sock(skb))) - return sk; - else - return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, - iph->daddr, dport, inet_iif(skb), - udptable); + return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, + iph->daddr, dport, inet_iif(skb), + udptable); } struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, @@ -1603,12 +1599,16 @@ static void flush_stack(struct sock **stack, unsigned int count, kfree_skb(skb1); } -static void udp_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +/* For TCP sockets, sk_rx_dst is protected by socket lock + * For UDP, we use xchg() to guard against concurrent 
changes. + */ +static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { - struct dst_entry *dst = skb_dst(skb); + struct dst_entry *old; dst_hold(dst); - sk->sk_rx_dst = dst; + old = xchg(&sk->sk_rx_dst, dst); + dst_release(old); } /* @@ -1739,15 +1739,16 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp4_csum_init(skb, uh, proto)) goto csum_error; - if (skb->sk) { + sk = skb_steal_sock(skb); + if (sk) { + struct dst_entry *dst = skb_dst(skb); int ret; - sk = skb->sk; - if (unlikely(sk->sk_rx_dst == NULL)) - udp_sk_rx_dst_set(sk, skb); + if (unlikely(sk->sk_rx_dst != dst)) + udp_sk_rx_dst_set(sk, dst); ret = udp_queue_rcv_skb(sk, skb); - + sock_put(sk); /* a return value > 0 means to resubmit the input, but * it wants the return to be -protocol, or 0 */ @@ -1913,17 +1914,20 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, void udp_v4_early_demux(struct sk_buff *skb) { - const struct iphdr *iph = ip_hdr(skb); - const struct udphdr *uh = udp_hdr(skb); + struct net *net = dev_net(skb->dev); + const struct iphdr *iph; + const struct udphdr *uh; struct sock *sk; struct dst_entry *dst; - struct net *net = dev_net(skb->dev); int dif = skb->dev->ifindex; /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) return; + iph = ip_hdr(skb); + uh = udp_hdr(skb); + if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 0b2a0641526a..542074c00c78 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -16,7 +16,7 @@ static int xfrm4_init_flags(struct xfrm_state *x) { - if (ipv4_config.no_pmtu_disc) + if (xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc) x->props.flags |= XFRM_STATE_NOPMTUDISC; return 0; } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 027615ac1b4e..5e76dfa765c4 100644 --- a/net/ipv6/af_inet6.c 
+++ b/net/ipv6/af_inet6.c @@ -213,7 +213,7 @@ lookup_protocol: inet->mc_list = NULL; inet->rcv_tos = 0; - if (ipv4_config.no_pmtu_disc) + if (net->ipv4.sysctl_ip_no_pmtu_disc) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 8dfe1f4d3c1a..93b1aa34c432 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -73,7 +73,6 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - usin->sin6_addr = flowlabel->dst; } } diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index e27591635f92..3fd0a578329e 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -122,7 +122,11 @@ out: static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) { struct rt6_info *rt = (struct rt6_info *) arg->result; - struct net_device *dev = rt->rt6i_idev->dev; + struct net_device *dev = NULL; + + if (rt->rt6i_idev) + dev = rt->rt6i_idev->dev; + /* do not accept result if the route does * not meet the required prefix length */ diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 9a311cc79672..bc4e1bcdf4c0 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1165,10 +1165,10 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, np->cork.hop_limit = hlimit; np->cork.tclass = tclass; if (rt->dst.flags & DST_XFRM_TUNNEL) - mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? + mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? rt->dst.dev->mtu : dst_mtu(&rt->dst); else - mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? + mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? 
rt->dst.dev->mtu : dst_mtu(rt->dst.path); if (np->frag_size < mtu) { if (np->frag_size) @@ -1270,7 +1270,7 @@ alloc_new_skb: if (skb == NULL || skb_prev == NULL) ip6_append_data_mtu(&mtu, &maxfraglen, fragheaderlen, skb, rt, - np->pmtudisc == + np->pmtudisc >= IPV6_PMTUDISC_PROBE); skb_prev = skb; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 759fbf96515b..af0ecb94b3b4 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -722,7 +722,7 @@ done: case IPV6_MTU_DISCOVER: if (optlen < sizeof(int)) goto e_inval; - if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE) + if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_INTERFACE) goto e_inval; np->pmtudisc = val; retv = 0; diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index f78f41aca8e9..a0d17270117c 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -446,6 +446,7 @@ static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) static struct xt_target synproxy_tg6_reg __read_mostly = { .name = "SYNPROXY", .family = NFPROTO_IPV6, + .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD), .target = synproxy_tg6, .targetsize = sizeof(struct xt_synproxy_info), .checkentry = synproxy_tg6_check, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 7fb4e14c467f..b6bb87e55805 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -792,7 +792,6 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - daddr = &flowlabel->dst; } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a1a57523b158..89b2735cecf5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2170,12 +2170,10 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, bool anycast) { struct net *net = dev_net(idev->dev); - struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL); - - if (!rt) { - 
net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n"); + struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, + DST_NOCOUNT, NULL); + if (!rt) return ERR_PTR(-ENOMEM); - } in6_dev_hold(idev); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index bd91e7ff482b..d955487f2c54 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -156,7 +156,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - usin->sin6_addr = flowlabel->dst; fl6_sock_release(flowlabel); } } @@ -398,6 +397,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sk->sk_state == TCP_LISTEN) goto out; + if (!ip6_sk_accept_pmtu(sk)) + goto out; + tp->mtu_info = ntohl(info); if (!sock_owned_by_user(sk)) tcp_v6_mtu_reduced(sk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index bcd5699313c3..65ed5cd79264 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -538,8 +538,11 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sk == NULL) return; - if (type == ICMPV6_PKT_TOOBIG) + if (type == ICMPV6_PKT_TOOBIG) { + if (!ip6_sk_accept_pmtu(sk)) + goto out; ip6_sk_update_pmtu(skb, sk, info); + } if (type == NDISC_REDIRECT) { ip6_sk_redirect(skb, sk); goto out; @@ -1140,7 +1143,6 @@ do_udp_sendmsg: flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - daddr = &flowlabel->dst; } } diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index d9b437e55007..bb6e206ea70b 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -528,7 +528,6 @@ static int l2tp_ip6_sendmsg(struct kiocb *iocb, struct sock *sk, flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - daddr = &flowlabel->dst; } } diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 2bc2dec20b00..6226803fc490 100644 --- 
a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -59,7 +59,7 @@ hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, u32 *multi) { return ip1->ipcmp == ip2->ipcmp && - ip2->ccmp == ip2->ccmp; + ip1->ccmp == ip2->ccmp; } static inline int diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index dcddc49c0e08..f93b7d06f4be 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1717,6 +1717,19 @@ nf_tables_delrule_one(struct nft_ctx *ctx, struct nft_rule *rule) return -ENOENT; } +static int nf_table_delrule_by_chain(struct nft_ctx *ctx) +{ + struct nft_rule *rule; + int err; + + list_for_each_entry(rule, &ctx->chain->rules, list) { + err = nf_tables_delrule_one(ctx, rule); + if (err < 0) + return err; + } + return 0; +} + static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -1725,8 +1738,8 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, const struct nft_af_info *afi; struct net *net = sock_net(skb->sk); const struct nft_table *table; - struct nft_chain *chain; - struct nft_rule *rule, *tmp; + struct nft_chain *chain = NULL; + struct nft_rule *rule; int family = nfmsg->nfgen_family, err = 0; struct nft_ctx ctx; @@ -1738,22 +1751,29 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); - if (IS_ERR(chain)) - return PTR_ERR(chain); + if (nla[NFTA_RULE_CHAIN]) { + chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); + if (IS_ERR(chain)) + return PTR_ERR(chain); + } nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); - if (nla[NFTA_RULE_HANDLE]) { - rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); - if (IS_ERR(rule)) - return PTR_ERR(rule); + if (chain) { + if (nla[NFTA_RULE_HANDLE]) { + rule = nf_tables_rule_lookup(chain, + 
nla[NFTA_RULE_HANDLE]); + if (IS_ERR(rule)) + return PTR_ERR(rule); - err = nf_tables_delrule_one(&ctx, rule); - } else { - /* Remove all rules in this chain */ - list_for_each_entry_safe(rule, tmp, &chain->rules, list) { err = nf_tables_delrule_one(&ctx, rule); + } else { + err = nf_table_delrule_by_chain(&ctx); + } + } else { + list_for_each_entry(chain, &table->chains, list) { + ctx.chain = chain; + err = nf_table_delrule_by_chain(&ctx); if (err < 0) break; } diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 9ff035c71403..a3910fc2122b 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -325,21 +325,24 @@ static void htable_gc(unsigned long htlong) add_timer(&ht->timer); } -static void htable_destroy(struct xt_hashlimit_htable *hinfo) +static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo) { struct hashlimit_net *hashlimit_net = hashlimit_pernet(hinfo->net); struct proc_dir_entry *parent; - del_timer_sync(&hinfo->timer); - if (hinfo->family == NFPROTO_IPV4) parent = hashlimit_net->ipt_hashlimit; else parent = hashlimit_net->ip6t_hashlimit; - if(parent != NULL) + if (parent != NULL) remove_proc_entry(hinfo->name, parent); +} +static void htable_destroy(struct xt_hashlimit_htable *hinfo) +{ + del_timer_sync(&hinfo->timer); + htable_remove_proc_entry(hinfo); htable_selective_cleanup(hinfo, select_all); kfree(hinfo->name); vfree(hinfo); @@ -883,21 +886,15 @@ static int __net_init hashlimit_proc_net_init(struct net *net) static void __net_exit hashlimit_proc_net_exit(struct net *net) { struct xt_hashlimit_htable *hinfo; - struct proc_dir_entry *pde; struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); - /* recent_net_exit() is called before recent_mt_destroy(). Make sure - * that the parent xt_recent proc entry is is empty before trying to - * remove it. + /* hashlimit_net_exit() is called before hashlimit_mt_destroy(). 
+ * Make sure that the parent ipt_hashlimit and ip6t_hashlimit proc + * entries is empty before trying to remove it. */ mutex_lock(&hashlimit_mutex); - pde = hashlimit_net->ipt_hashlimit; - if (pde == NULL) - pde = hashlimit_net->ip6t_hashlimit; - hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) - remove_proc_entry(hinfo->name, pde); - + htable_remove_proc_entry(hinfo); hashlimit_net->ipt_hashlimit = NULL; hashlimit_net->ip6t_hashlimit = NULL; mutex_unlock(&hashlimit_mutex); diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c index 9d68441e2a5a..2277276f52bc 100644 --- a/net/nfc/af_nfc.c +++ b/net/nfc/af_nfc.c @@ -16,9 +16,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include <linux/nfc.h> diff --git a/net/nfc/core.c b/net/nfc/core.c index 872529105abc..02ab34132157 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -16,9 +16,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ diff --git a/net/nfc/hci/command.c b/net/nfc/hci/command.c index 64f922be9281..a9f4d2e62d8d 100644 --- a/net/nfc/hci/command.c +++ b/net/nfc/hci/command.c @@ -12,9 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) "hci: %s: " fmt, __func__ diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index d07ca4c5cf8c..3b9610031baa 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -12,9 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) "hci: %s: " fmt, __func__ diff --git a/net/nfc/hci/hci.h b/net/nfc/hci/hci.h index b274d12c18ac..c3d2e2c1394c 100644 --- a/net/nfc/hci/hci.h +++ b/net/nfc/hci/hci.h @@ -12,9 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef __LOCAL_HCI_H diff --git a/net/nfc/hci/hcp.c b/net/nfc/hci/hcp.c index b6b4109f2343..e9de1514656e 100644 --- a/net/nfc/hci/hcp.c +++ b/net/nfc/hci/hcp.c @@ -12,9 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) "hci: %s: " fmt, __func__ diff --git a/net/nfc/hci/llc.c b/net/nfc/hci/llc.c index fe5e966e5b88..a07d2b818487 100644 --- a/net/nfc/hci/llc.c +++ b/net/nfc/hci/llc.c @@ -13,9 +13,7 @@ * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include <net/nfc/llc.h> diff --git a/net/nfc/hci/llc.h b/net/nfc/hci/llc.h index 7be0b7f3ceb6..5dad4c57ffb3 100644 --- a/net/nfc/hci/llc.h +++ b/net/nfc/hci/llc.h @@ -13,9 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef __LOCAL_LLC_H_ diff --git a/net/nfc/hci/llc_nop.c b/net/nfc/hci/llc_nop.c index 87b10291b40f..d0435d5a197b 100644 --- a/net/nfc/hci/llc_nop.c +++ b/net/nfc/hci/llc_nop.c @@ -13,9 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include <linux/types.h> diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index 27b313befc35..719ad0ac40de 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -13,9 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ #define pr_fmt(fmt) "shdlc: %s: " fmt, __func__ diff --git a/net/nfc/llcp.h b/net/nfc/llcp.h index f4d48b57ea11..de1789e3cc82 100644 --- a/net/nfc/llcp.h +++ b/net/nfc/llcp.h @@ -12,9 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ enum llcp_state { diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index 1017894807c0..693cd1aad582 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -12,9 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) "llcp: %s: " fmt, __func__ diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index 81cd3416c7d4..1349074e1ffc 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -12,9 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) "llcp: %s: " fmt, __func__ diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 824c6056bf82..69fbc8dadba7 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -12,9 +12,7 @@ * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) "llcp: %s: " fmt, __func__ diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index b943d46a1644..f0e955e3a385 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -20,8 +20,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * */ diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index 2a9399dd6c68..6c3aef852876 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -16,8 +16,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * */ diff --git a/net/nfc/nci/lib.c b/net/nfc/nci/lib.c index 6b7fd26c68d9..ed774a2e989a 100644 --- a/net/nfc/nci/lib.c +++ b/net/nfc/nci/lib.c @@ -20,8 +20,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
* */ diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index b2aa98ef0927..1e905097456b 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -20,8 +20,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * */ diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c index dd072f38ad00..041de51ccdbe 100644 --- a/net/nfc/nci/rsp.c +++ b/net/nfc/nci/rsp.c @@ -20,8 +20,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * */ diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index a9b2342d5253..ebbf6fb88b35 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -16,9 +16,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index aaf606fc1faa..9d6e74f7e6b3 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -16,9 +16,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef __LOCAL_NFC_H diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 66bcd2eb5773..c27a6e86cae4 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -16,9 +16,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 65cfaa816075..716b7eebfe70 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -165,7 +165,7 @@ static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, } csum_replace4(&nh->check, *addr, new_addr); - skb->rxhash = 0; + skb_clear_hash(skb); *addr = new_addr; } @@ -199,7 +199,7 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, if (recalculate_csum) update_ipv6_checksum(skb, l4_proto, addr, new_addr); - skb->rxhash = 0; + skb_clear_hash(skb); memcpy(addr, new_addr, sizeof(__be32[4])); } @@ -296,7 +296,7 @@ static void set_tp_port(struct sk_buff *skb, __be16 *port, { inet_proto_csum_replace2(check, skb, *port, new_port, 0); *port = new_port; - skb->rxhash = 0; + skb_clear_hash(skb); } static void set_udp_port(struct sk_buff *skb, __be16 *port, __be16 new_port) @@ -310,7 +310,7 @@ static void set_udp_port(struct sk_buff *skb, __be16 *port, __be16 new_port) uh->check = CSUM_MANGLED_0; } else { *port = new_port; - skb->rxhash = 0; + skb_clear_hash(skb); } } @@ -381,7 +381,7 @@ static int set_sctp(struct sk_buff *skb, /* Carry any checksum errors through. 
*/ sh->checksum = old_csum ^ old_correct_csum ^ new_csum; - skb->rxhash = 0; + skb_clear_hash(skb); } return 0; diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index e42542706087..0e720c316070 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -25,7 +25,7 @@ #include <linux/if_vlan.h> #include <net/llc_pdu.h> #include <linux/kernel.h> -#include <linux/jhash.h> +#include <linux/hash.h> #include <linux/jiffies.h> #include <linux/llc.h> #include <linux/module.h> @@ -362,7 +362,7 @@ static u32 flow_hash(const struct sw_flow_key *key, int key_start, /* Make sure number of hash bytes are multiple of u32. */ BUILD_BUG_ON(sizeof(long) % sizeof(u32)); - return jhash2(hash_key, hash_u32s, 0); + return arch_fast_hash2(hash_key, hash_u32s, 0); } static int flow_key_start(const struct sw_flow_key *key) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index cc803c63059a..dd3840846ce2 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -963,7 +963,7 @@ static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { - ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb); + ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb); } static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, @@ -977,9 +977,11 @@ static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, { if (vlan_tx_tag_present(pkc->skb)) { ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb); - ppd->tp_status = TP_STATUS_VLAN_VALID; + ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto); + ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { ppd->hv1.tp_vlan_tci = 0; + ppd->hv1.tp_vlan_tpid = 0; ppd->tp_status = TP_STATUS_AVAILABLE; } } @@ -987,6 +989,7 @@ static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { + ppd->hv1.tp_padding = 0; 
prb_fill_vlan_info(pkc, ppd); if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH) @@ -1295,7 +1298,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, if (!skb) return 0; } - skb_get_rxhash(skb); + skb_get_hash(skb); idx = fanout_demux_hash(f, skb, num); break; case PACKET_FANOUT_LB: @@ -1812,6 +1815,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct timespec ts; __u32 ts_status; + /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. + * We may add members to them until current aligned size without forcing + * userspace to call getsockopt(..., PACKET_HDRLEN, ...). + */ + BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32); + BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48); + if (skb->pkt_type == PACKET_LOOPBACK) goto drop; @@ -1918,11 +1928,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, h.h2->tp_nsec = ts.tv_nsec; if (vlan_tx_tag_present(skb)) { h.h2->tp_vlan_tci = vlan_tx_tag_get(skb); - status |= TP_STATUS_VLAN_VALID; + h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); + status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { h.h2->tp_vlan_tci = 0; + h.h2->tp_vlan_tpid = 0; } - h.h2->tp_padding = 0; + memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding)); hdrlen = sizeof(*h.h2); break; case TPACKET_V3: @@ -1936,6 +1948,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, h.h3->tp_net = netoff; h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; + memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding)); hdrlen = sizeof(*h.h3); break; default: @@ -2867,11 +2880,12 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, aux.tp_net = skb_network_offset(skb); if (vlan_tx_tag_present(skb)) { aux.tp_vlan_tci = vlan_tx_tag_get(skb); - aux.tp_status |= TP_STATUS_VLAN_VALID; + aux.tp_vlan_tpid = ntohs(skb->vlan_proto); + aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { aux.tp_vlan_tci = 0; + 
aux.tp_vlan_tpid = 0; } - aux.tp_padding = 0; put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); } diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 1bacc1079942..ed7e0b4e7f90 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -14,9 +14,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include <linux/kernel.h> diff --git a/net/sched/Kconfig b/net/sched/Kconfig index ad1f1d819203..919847beec39 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -286,6 +286,15 @@ config NET_SCH_FQ If unsure, say N. +config NET_SCH_HHF + tristate "Heavy-Hitter Filter (HHF)" + help + Say Y here if you want to use the Heavy-Hitter Filter (HHF) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_hhf. 
+ config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT diff --git a/net/sched/Makefile b/net/sched/Makefile index 35fa47a494ab..3442e5fbc4d7 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o +obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 4adbce8f8314..8114fef308d9 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -29,25 +29,16 @@ void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo) { - unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); - struct tcf_common **p1p; - - for (p1p = &hinfo->htab[h]; *p1p; p1p = &(*p1p)->tcfc_next) { - if (*p1p == p) { - write_lock_bh(hinfo->lock); - *p1p = p->tcfc_next; - write_unlock_bh(hinfo->lock); - gen_kill_estimator(&p->tcfc_bstats, - &p->tcfc_rate_est); - /* - * gen_estimator est_timer() might access p->tcfc_lock - * or bstats, wait a RCU grace period before freeing p - */ - kfree_rcu(p, tcfc_rcu); - return; - } - } - WARN_ON(1); + spin_lock_bh(&hinfo->lock); + hlist_del(&p->tcfc_head); + spin_unlock_bh(&hinfo->lock); + gen_kill_estimator(&p->tcfc_bstats, + &p->tcfc_rate_est); + /* + * gen_estimator est_timer() might access p->tcfc_lock + * or bstats, wait a RCU grace period before freeing p + */ + kfree_rcu(p, tcfc_rcu); } EXPORT_SYMBOL(tcf_hash_destroy); @@ -73,18 +64,19 @@ EXPORT_SYMBOL(tcf_hash_release); static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, struct tc_action *a, struct tcf_hashinfo *hinfo) { + struct hlist_head *head; struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; struct nlattr *nest; - read_lock_bh(hinfo->lock); + spin_lock_bh(&hinfo->lock); s_i = cb->args[0]; for (i = 0; i < 
(hinfo->hmask + 1); i++) { - p = hinfo->htab[tcf_hash(i, hinfo->hmask)]; + head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; - for (; p; p = p->tcfc_next) { + hlist_for_each_entry_rcu(p, head, tcfc_head) { index++; if (index < s_i) continue; @@ -107,7 +99,7 @@ static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, } } done: - read_unlock_bh(hinfo->lock); + spin_unlock_bh(&hinfo->lock); if (n_i) cb->args[0] += n_i; return n_i; @@ -120,7 +112,9 @@ nla_put_failure: static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a, struct tcf_hashinfo *hinfo) { - struct tcf_common *p, *s_p; + struct hlist_head *head; + struct hlist_node *n; + struct tcf_common *p; struct nlattr *nest; int i = 0, n_i = 0; @@ -130,14 +124,11 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a, if (nla_put_string(skb, TCA_KIND, a->ops->kind)) goto nla_put_failure; for (i = 0; i < (hinfo->hmask + 1); i++) { - p = hinfo->htab[tcf_hash(i, hinfo->hmask)]; - - while (p != NULL) { - s_p = p->tcfc_next; + head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; + hlist_for_each_entry_safe(p, n, head, tcfc_head) { if (ACT_P_DELETED == tcf_hash_release(p, 0, hinfo)) module_put(a->ops->owner); n_i++; - p = s_p; } } if (nla_put_u32(skb, TCA_FCNT, n_i)) @@ -168,15 +159,15 @@ EXPORT_SYMBOL(tcf_generic_walker); struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) { - struct tcf_common *p; + struct tcf_common *p = NULL; + struct hlist_head *head; - read_lock_bh(hinfo->lock); - for (p = hinfo->htab[tcf_hash(index, hinfo->hmask)]; p; - p = p->tcfc_next) { + spin_lock_bh(&hinfo->lock); + head = &hinfo->htab[tcf_hash(index, hinfo->hmask)]; + hlist_for_each_entry_rcu(p, head, tcfc_head) if (p->tcfc_index == index) break; - } - read_unlock_bh(hinfo->lock); + spin_unlock_bh(&hinfo->lock); return p; } @@ -236,6 +227,7 @@ struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est, p->tcfc_bindcnt = 1; spin_lock_init(&p->tcfc_lock); + 
INIT_HLIST_NODE(&p->tcfc_head); p->tcfc_index = index ? index : tcf_hash_new_index(idx_gen, hinfo); p->tcfc_tm.install = jiffies; p->tcfc_tm.lastuse = jiffies; @@ -257,19 +249,18 @@ void tcf_hash_insert(struct tcf_common *p, struct tcf_hashinfo *hinfo) { unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); - write_lock_bh(hinfo->lock); - p->tcfc_next = hinfo->htab[h]; - hinfo->htab[h] = p; - write_unlock_bh(hinfo->lock); + spin_lock_bh(&hinfo->lock); + hlist_add_head(&p->tcfc_head, &hinfo->htab[h]); + spin_unlock_bh(&hinfo->lock); } EXPORT_SYMBOL(tcf_hash_insert); -static struct tc_action_ops *act_base = NULL; +static LIST_HEAD(act_base); static DEFINE_RWLOCK(act_mod_lock); int tcf_register_action(struct tc_action_ops *act) { - struct tc_action_ops *a, **ap; + struct tc_action_ops *a; /* Must supply act, dump, cleanup and init */ if (!act->act || !act->dump || !act->cleanup || !act->init) @@ -282,14 +273,13 @@ int tcf_register_action(struct tc_action_ops *act) act->walk = tcf_generic_walker; write_lock(&act_mod_lock); - for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) { + list_for_each_entry(a, &act_base, head) { if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { write_unlock(&act_mod_lock); return -EEXIST; } } - act->next = NULL; - *ap = act; + list_add_tail(&act->head, &act_base); write_unlock(&act_mod_lock); return 0; } @@ -297,16 +287,15 @@ EXPORT_SYMBOL(tcf_register_action); int tcf_unregister_action(struct tc_action_ops *act) { - struct tc_action_ops *a, **ap; + struct tc_action_ops *a; int err = -ENOENT; write_lock(&act_mod_lock); - for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) + list_for_each_entry(a, &act_base, head) if (a == act) break; if (a) { - *ap = a->next; - a->next = NULL; + list_del(&act->head); err = 0; } write_unlock(&act_mod_lock); @@ -321,7 +310,7 @@ static struct tc_action_ops *tc_lookup_action_n(char *kind) if (kind) { read_lock(&act_mod_lock); - for (a = act_base; a; a = a->next) { + list_for_each_entry(a, 
&act_base, head) { if (strcmp(kind, a->kind) == 0) { if (!try_module_get(a->owner)) { read_unlock(&act_mod_lock); @@ -342,7 +331,7 @@ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind) if (kind) { read_lock(&act_mod_lock); - for (a = act_base; a; a = a->next) { + list_for_each_entry(a, &act_base, head) { if (nla_strcmp(kind, a->kind) == 0) { if (!try_module_get(a->owner)) { read_unlock(&act_mod_lock); @@ -379,7 +368,7 @@ static struct tc_action_ops *tc_lookup_action_id(u32 type) } #endif -int tcf_action_exec(struct sk_buff *skb, const struct tc_action *act, +int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions, struct tcf_result *res) { const struct tc_action *a; @@ -390,7 +379,7 @@ int tcf_action_exec(struct sk_buff *skb, const struct tc_action *act, ret = TC_ACT_OK; goto exec_done; } - while ((a = act) != NULL) { + list_for_each_entry(a, actions, list) { repeat: if (a->ops) { ret = a->ops->act(skb, a, res); @@ -404,27 +393,26 @@ repeat: if (ret != TC_ACT_PIPE) goto exec_done; } - act = a->next; } exec_done: return ret; } EXPORT_SYMBOL(tcf_action_exec); -void tcf_action_destroy(struct tc_action *act, int bind) +void tcf_action_destroy(struct list_head *actions, int bind) { - struct tc_action *a; + struct tc_action *a, *tmp; - for (a = act; a; a = act) { + list_for_each_entry_safe(a, tmp, actions, list) { if (a->ops) { if (a->ops->cleanup(a, bind) == ACT_P_DELETED) module_put(a->ops->owner); - act = act->next; + list_del(&a->list); kfree(a); } else { /*FIXME: Remove later - catch insertion bugs*/ WARN(1, "tcf_action_destroy: BUG? 
destroying NULL ops\n"); - act = act->next; + list_del(&a->list); kfree(a); } } @@ -470,14 +458,13 @@ nla_put_failure: EXPORT_SYMBOL(tcf_action_dump_1); int -tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) +tcf_action_dump(struct sk_buff *skb, struct list_head *actions, int bind, int ref) { struct tc_action *a; int err = -EINVAL; struct nlattr *nest; - while ((a = act) != NULL) { - act = a->next; + list_for_each_entry(a, actions, list) { nest = nla_nest_start(skb, a->order); if (nest == NULL) goto nla_put_failure; @@ -552,6 +539,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, if (a == NULL) goto err_mod; + INIT_LIST_HEAD(&a->list); /* backward compatibility for policer */ if (name == NULL) err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, a, ovr, bind); @@ -578,37 +566,33 @@ err_out: return ERR_PTR(err); } -struct tc_action *tcf_action_init(struct net *net, struct nlattr *nla, +int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est, char *name, int ovr, - int bind) + int bind, struct list_head *actions) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; - struct tc_action *head = NULL, *act, *act_prev = NULL; + struct tc_action *act; int err; int i; err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); if (err < 0) - return ERR_PTR(err); + return err; for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_init_1(net, tb[i], est, name, ovr, bind); - if (IS_ERR(act)) + if (IS_ERR(act)) { + err = PTR_ERR(act); goto err; + } act->order = i; - - if (head == NULL) - head = act; - else - act_prev->next = act; - act_prev = act; + list_add_tail(&act->list, actions); } - return head; + return 0; err: - if (head != NULL) - tcf_action_destroy(head, bind); - return act; + tcf_action_destroy(actions, bind); + return err; } int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, @@ -637,10 +621,6 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (err < 0) 
goto errout; - if (a->ops != NULL && a->ops->get_stats != NULL) - if (a->ops->get_stats(skb, a) < 0) - goto errout; - if (gnet_stats_copy_basic(&d, &h->tcf_bstats) < 0 || gnet_stats_copy_rate_est(&d, &h->tcf_bstats, &h->tcf_rate_est) < 0 || @@ -657,7 +637,7 @@ errout: } static int -tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 portid, u32 seq, +tca_get_fill(struct sk_buff *skb, struct list_head *actions, u32 portid, u32 seq, u16 flags, int event, int bind, int ref) { struct tcamsg *t; @@ -677,7 +657,7 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 portid, u32 seq, if (nest == NULL) goto out_nlmsg_trim; - if (tcf_action_dump(skb, a, bind, ref) < 0) + if (tcf_action_dump(skb, actions, bind, ref) < 0) goto out_nlmsg_trim; nla_nest_end(skb, nest); @@ -692,14 +672,14 @@ out_nlmsg_trim: static int act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, - struct tc_action *a, int event) + struct list_head *actions, int event) { struct sk_buff *skb; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (tca_get_fill(skb, a, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { + if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -730,6 +710,7 @@ tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid) if (a == NULL) goto err_out; + INIT_LIST_HEAD(&a->list); err = -EINVAL; a->ops = tc_lookup_action(tb[TCA_ACT_KIND]); if (a->ops == NULL) @@ -749,12 +730,12 @@ err_out: return ERR_PTR(err); } -static void cleanup_a(struct tc_action *act) +static void cleanup_a(struct list_head *actions) { - struct tc_action *a; + struct tc_action *a, *tmp; - for (a = act; a; a = act) { - act = a->next; + list_for_each_entry_safe(a, tmp, actions, list) { + list_del(&a->list); kfree(a); } } @@ -769,6 +750,7 @@ static struct tc_action *create_a(int i) return NULL; } act->order = i; + INIT_LIST_HEAD(&act->list); return act; } @@ -856,7 +838,8 @@ tca_action_gd(struct net 
*net, struct nlattr *nla, struct nlmsghdr *n, { int i, ret; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; - struct tc_action *head = NULL, *act, *act_prev = NULL; + struct tc_action *act; + LIST_HEAD(actions); ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); if (ret < 0) @@ -876,16 +859,11 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, goto err; } act->order = i; - - if (head == NULL) - head = act; - else - act_prev->next = act; - act_prev = act; + list_add_tail(&act->list, &actions); } if (event == RTM_GETACTION) - ret = act_get_notify(net, portid, n, head, event); + ret = act_get_notify(net, portid, n, &actions, event); else { /* delete */ struct sk_buff *skb; @@ -895,7 +873,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, goto err; } - if (tca_get_fill(skb, head, portid, n->nlmsg_seq, 0, event, + if (tca_get_fill(skb, &actions, portid, n->nlmsg_seq, 0, event, 0, 1) <= 0) { kfree_skb(skb); ret = -EINVAL; @@ -903,7 +881,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, } /* now do the delete */ - tcf_action_destroy(head, 0); + tcf_action_destroy(&actions, 0); ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (ret > 0) @@ -911,11 +889,11 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, return ret; } err: - cleanup_a(head); + cleanup_a(&actions); return ret; } -static int tcf_add_notify(struct net *net, struct tc_action *a, +static int tcf_add_notify(struct net *net, struct list_head *actions, u32 portid, u32 seq, int event, u16 flags) { struct tcamsg *t; @@ -943,7 +921,7 @@ static int tcf_add_notify(struct net *net, struct tc_action *a, if (nest == NULL) goto out_kfree_skb; - if (tcf_action_dump(skb, a, 0, 0) < 0) + if (tcf_action_dump(skb, actions, 0, 0) < 0) goto out_kfree_skb; nla_nest_end(skb, nest); @@ -967,26 +945,18 @@ tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, int ovr) { int 
ret = 0; - struct tc_action *act; - struct tc_action *a; + LIST_HEAD(actions); u32 seq = n->nlmsg_seq; - act = tcf_action_init(net, nla, NULL, NULL, ovr, 0); - if (act == NULL) + ret = tcf_action_init(net, nla, NULL, NULL, ovr, 0, &actions); + if (ret) goto done; - if (IS_ERR(act)) { - ret = PTR_ERR(act); - goto done; - } /* dump then free all the actions after update; inserted policy * stays intact */ - ret = tcf_add_notify(net, act, portid, seq, RTM_NEWACTION, n->nlmsg_flags); - for (a = act; a; a = act) { - act = a->next; - kfree(a); - } + ret = tcf_add_notify(net, &actions, portid, seq, RTM_NEWACTION, n->nlmsg_flags); + cleanup_a(&actions); done: return ret; } diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 5c5edf56adbd..5d350c57af3f 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -37,15 +37,8 @@ #include <net/tc_act/tc_csum.h> #define CSUM_TAB_MASK 15 -static struct tcf_common *tcf_csum_ht[CSUM_TAB_MASK + 1]; static u32 csum_idx_gen; -static DEFINE_RWLOCK(csum_lock); - -static struct tcf_hashinfo csum_hash_info = { - .htab = tcf_csum_ht, - .hmask = CSUM_TAB_MASK, - .lock = &csum_lock, -}; +static struct tcf_hashinfo csum_hash_info; static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, @@ -593,6 +586,10 @@ MODULE_LICENSE("GPL"); static int __init csum_init_module(void) { + int err = tcf_hashinfo_init(&csum_hash_info, CSUM_TAB_MASK+1); + if (err) + return err; + return tcf_register_action(&act_csum_ops); } diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 5645a4d32abd..1e6e0e765243 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -24,15 +24,8 @@ #include <net/tc_act/tc_gact.h> #define GACT_TAB_MASK 15 -static struct tcf_common *tcf_gact_ht[GACT_TAB_MASK + 1]; static u32 gact_idx_gen; -static DEFINE_RWLOCK(gact_lock); - -static struct tcf_hashinfo gact_hash_info = { - .htab = tcf_gact_ht, - .hmask = GACT_TAB_MASK, - .lock = &gact_lock, -}; 
+static struct tcf_hashinfo gact_hash_info; #ifdef CONFIG_GACT_PROB static int gact_net_rand(struct tcf_gact *gact) @@ -215,6 +208,9 @@ MODULE_LICENSE("GPL"); static int __init gact_init_module(void) { + int err = tcf_hashinfo_init(&gact_hash_info, GACT_TAB_MASK+1); + if (err) + return err; #ifdef CONFIG_GACT_PROB pr_info("GACT probability on\n"); #else @@ -226,6 +222,7 @@ static int __init gact_init_module(void) static void __exit gact_cleanup_module(void) { tcf_unregister_action(&act_gact_ops); + tcf_hashinfo_destroy(&gact_hash_info); } module_init(gact_init_module); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 882a89762f77..8344380ebaf1 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -29,15 +29,8 @@ #define IPT_TAB_MASK 15 -static struct tcf_common *tcf_ipt_ht[IPT_TAB_MASK + 1]; static u32 ipt_idx_gen; -static DEFINE_RWLOCK(ipt_lock); - -static struct tcf_hashinfo ipt_hash_info = { - .htab = tcf_ipt_ht, - .hmask = IPT_TAB_MASK, - .lock = &ipt_lock, -}; +static struct tcf_hashinfo ipt_hash_info; static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook) { @@ -320,7 +313,11 @@ MODULE_ALIAS("act_xt"); static int __init ipt_init_module(void) { - int ret1, ret2; + int ret1, ret2, err; + err = tcf_hashinfo_init(&ipt_hash_info, IPT_TAB_MASK+1); + if (err) + return err; + ret1 = tcf_register_action(&act_xt_ops); if (ret1 < 0) printk("Failed to load xt action\n"); @@ -328,9 +325,10 @@ static int __init ipt_init_module(void) if (ret2 < 0) printk("Failed to load ipt action\n"); - if (ret1 < 0 && ret2 < 0) + if (ret1 < 0 && ret2 < 0) { + tcf_hashinfo_destroy(&ipt_hash_info); return ret1; - else + } else return 0; } @@ -338,6 +336,7 @@ static void __exit ipt_cleanup_module(void) { tcf_unregister_action(&act_xt_ops); tcf_unregister_action(&act_ipt_ops); + tcf_hashinfo_destroy(&ipt_hash_info); } module_init(ipt_init_module); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 252378121ce7..199fc9838af3 
100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -30,16 +30,9 @@ #include <linux/if_arp.h> #define MIRRED_TAB_MASK 7 -static struct tcf_common *tcf_mirred_ht[MIRRED_TAB_MASK + 1]; static u32 mirred_idx_gen; -static DEFINE_RWLOCK(mirred_lock); static LIST_HEAD(mirred_list); - -static struct tcf_hashinfo mirred_hash_info = { - .htab = tcf_mirred_ht, - .hmask = MIRRED_TAB_MASK, - .lock = &mirred_lock, -}; +static struct tcf_hashinfo mirred_hash_info; static int tcf_mirred_release(struct tcf_mirred *m, int bind) { @@ -261,7 +254,6 @@ static struct notifier_block mirred_device_notifier = { .notifier_call = mirred_device_event, }; - static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .hinfo = &mirred_hash_info, @@ -284,6 +276,11 @@ static int __init mirred_init_module(void) if (err) return err; + err = tcf_hashinfo_init(&mirred_hash_info, MIRRED_TAB_MASK+1); + if (err) { + unregister_netdevice_notifier(&mirred_device_notifier); + return err; + } pr_info("Mirror/redirect action on\n"); return tcf_register_action(&act_mirred_ops); } @@ -291,6 +288,7 @@ static int __init mirred_init_module(void) static void __exit mirred_cleanup_module(void) { unregister_netdevice_notifier(&mirred_device_notifier); + tcf_hashinfo_destroy(&mirred_hash_info); tcf_unregister_action(&act_mirred_ops); } diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 6a15ace00241..409fe7181c5f 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -30,15 +30,9 @@ #define NAT_TAB_MASK 15 -static struct tcf_common *tcf_nat_ht[NAT_TAB_MASK + 1]; static u32 nat_idx_gen; -static DEFINE_RWLOCK(nat_lock); -static struct tcf_hashinfo nat_hash_info = { - .htab = tcf_nat_ht, - .hmask = NAT_TAB_MASK, - .lock = &nat_lock, -}; +static struct tcf_hashinfo nat_hash_info; static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) }, @@ -316,12 +310,16 @@ MODULE_LICENSE("GPL"); static int __init nat_init_module(void) { + int 
err = tcf_hashinfo_init(&nat_hash_info, NAT_TAB_MASK+1); + if (err) + return err; return tcf_register_action(&act_nat_ops); } static void __exit nat_cleanup_module(void) { tcf_unregister_action(&act_nat_ops); + tcf_hashinfo_destroy(&nat_hash_info); } module_init(nat_init_module); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 03b67674169c..aa5347c1b9f1 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -24,15 +24,9 @@ #include <net/tc_act/tc_pedit.h> #define PEDIT_TAB_MASK 15 -static struct tcf_common *tcf_pedit_ht[PEDIT_TAB_MASK + 1]; static u32 pedit_idx_gen; -static DEFINE_RWLOCK(pedit_lock); -static struct tcf_hashinfo pedit_hash_info = { - .htab = tcf_pedit_ht, - .hmask = PEDIT_TAB_MASK, - .lock = &pedit_lock, -}; +static struct tcf_hashinfo pedit_hash_info; static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, @@ -252,11 +246,15 @@ MODULE_LICENSE("GPL"); static int __init pedit_init_module(void) { + int err = tcf_hashinfo_init(&pedit_hash_info, PEDIT_TAB_MASK+1); + if (err) + return err; return tcf_register_action(&act_pedit_ops); } static void __exit pedit_cleanup_module(void) { + tcf_hashinfo_destroy(&pedit_hash_info); tcf_unregister_action(&act_pedit_ops); } diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 16a62c36928a..7b23ab07c6cc 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -41,15 +41,8 @@ struct tcf_police { container_of(pc, struct tcf_police, common) #define POL_TAB_MASK 15 -static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1]; static u32 police_idx_gen; -static DEFINE_RWLOCK(police_lock); - -static struct tcf_hashinfo police_hash_info = { - .htab = tcf_police_ht, - .hmask = POL_TAB_MASK, - .lock = &police_lock, -}; +static struct tcf_hashinfo police_hash_info; /* old policer structure from before tc actions */ struct tc_police_compat { @@ -67,18 +60,19 @@ struct tc_police_compat { static int 
tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb, int type, struct tc_action *a) { + struct hlist_head *head; struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; struct nlattr *nest; - read_lock_bh(&police_lock); + spin_lock_bh(&police_hash_info.lock); s_i = cb->args[0]; for (i = 0; i < (POL_TAB_MASK + 1); i++) { - p = tcf_police_ht[tcf_hash(i, POL_TAB_MASK)]; + head = &police_hash_info.htab[tcf_hash(i, POL_TAB_MASK)]; - for (; p; p = p->tcfc_next) { + hlist_for_each_entry_rcu(p, head, tcfc_head) { index++; if (index < s_i) continue; @@ -101,7 +95,7 @@ static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *c } } done: - read_unlock_bh(&police_lock); + spin_unlock_bh(&police_hash_info.lock); if (n_i) cb->args[0] += n_i; return n_i; @@ -113,25 +107,16 @@ nla_put_failure: static void tcf_police_destroy(struct tcf_police *p) { - unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK); - struct tcf_common **p1p; - - for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->tcfc_next) { - if (*p1p == &p->common) { - write_lock_bh(&police_lock); - *p1p = p->tcf_next; - write_unlock_bh(&police_lock); - gen_kill_estimator(&p->tcf_bstats, - &p->tcf_rate_est); - /* - * gen_estimator est_timer() might access p->tcf_lock - * or bstats, wait a RCU grace period before freeing p - */ - kfree_rcu(p, tcf_rcu); - return; - } - } - WARN_ON(1); + spin_lock_bh(&police_hash_info.lock); + hlist_del(&p->tcf_head); + spin_unlock_bh(&police_hash_info.lock); + gen_kill_estimator(&p->tcf_bstats, + &p->tcf_rate_est); + /* + * gen_estimator est_timer() might access p->tcf_lock + * or bstats, wait a RCU grace period before freeing p + */ + kfree_rcu(p, tcf_rcu); } static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { @@ -266,10 +251,9 @@ override: police->tcf_index = parm->index ? 
parm->index : tcf_hash_new_index(&police_idx_gen, &police_hash_info); h = tcf_hash(police->tcf_index, POL_TAB_MASK); - write_lock_bh(&police_lock); - police->tcf_next = tcf_police_ht[h]; - tcf_police_ht[h] = &police->common; - write_unlock_bh(&police_lock); + spin_lock_bh(&police_hash_info.lock); + hlist_add_head(&police->tcf_head, &police_hash_info.htab[h]); + spin_unlock_bh(&police_hash_info.lock); a->priv = police; return ret; @@ -277,10 +261,8 @@ override: failure_unlock: spin_unlock_bh(&police->tcf_lock); failure: - if (P_tab) - qdisc_put_rtab(P_tab); - if (R_tab) - qdisc_put_rtab(R_tab); + qdisc_put_rtab(P_tab); + qdisc_put_rtab(R_tab); if (ret == ACT_P_CREATED) kfree(police); return err; @@ -414,12 +396,19 @@ static struct tc_action_ops act_police_ops = { static int __init police_init_module(void) { - return tcf_register_action(&act_police_ops); + int err = tcf_hashinfo_init(&police_hash_info, POL_TAB_MASK+1); + if (err) + return err; + err = tcf_register_action(&act_police_ops); + if (err) + tcf_hashinfo_destroy(&police_hash_info); + return err; } static void __exit police_cleanup_module(void) { + tcf_hashinfo_destroy(&police_hash_info); tcf_unregister_action(&act_police_ops); } diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 31157d3e729c..2d7a0eb11c69 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -25,15 +25,8 @@ #include <net/tc_act/tc_defact.h> #define SIMP_TAB_MASK 7 -static struct tcf_common *tcf_simp_ht[SIMP_TAB_MASK + 1]; static u32 simp_idx_gen; -static DEFINE_RWLOCK(simp_lock); - -static struct tcf_hashinfo simp_hash_info = { - .htab = tcf_simp_ht, - .hmask = SIMP_TAB_MASK, - .lock = &simp_lock, -}; +static struct tcf_hashinfo simp_hash_info; #define SIMP_MAX_DATA 32 static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, @@ -209,14 +202,23 @@ MODULE_LICENSE("GPL"); static int __init simp_init_module(void) { - int ret = tcf_register_action(&act_simp_ops); + int err, ret; + err = 
tcf_hashinfo_init(&simp_hash_info, SIMP_TAB_MASK+1); + if (err) + return err; + + ret = tcf_register_action(&act_simp_ops); if (!ret) pr_info("Simple TC action Loaded\n"); + else + tcf_hashinfo_destroy(&simp_hash_info); + return ret; } static void __exit simp_cleanup_module(void) { + tcf_hashinfo_destroy(&simp_hash_info); tcf_unregister_action(&act_simp_ops); } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index cf20add1c3ff..90ed04a83cf3 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -28,15 +28,8 @@ #include <net/tc_act/tc_skbedit.h> #define SKBEDIT_TAB_MASK 15 -static struct tcf_common *tcf_skbedit_ht[SKBEDIT_TAB_MASK + 1]; static u32 skbedit_idx_gen; -static DEFINE_RWLOCK(skbedit_lock); - -static struct tcf_hashinfo skbedit_hash_info = { - .htab = tcf_skbedit_ht, - .hmask = SKBEDIT_TAB_MASK, - .lock = &skbedit_lock, -}; +static struct tcf_hashinfo skbedit_hash_info; static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) @@ -210,11 +203,15 @@ MODULE_LICENSE("GPL"); static int __init skbedit_init_module(void) { + int err = tcf_hashinfo_init(&skbedit_hash_info, SKBEDIT_TAB_MASK+1); + if (err) + return err; return tcf_register_action(&act_skbedit_ops); } static void __exit skbedit_cleanup_module(void) { + tcf_hashinfo_destroy(&skbedit_hash_info); tcf_unregister_action(&act_skbedit_ops); } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8e118af90973..6b085cf27a65 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -31,8 +31,7 @@ #include <net/pkt_cls.h> /* The list of all installed classifier types */ - -static struct tcf_proto_ops *tcf_proto_base __read_mostly; +static LIST_HEAD(tcf_proto_base); /* Protects list of registered TC modules. It is pure SMP lock. 
*/ static DEFINE_RWLOCK(cls_mod_lock); @@ -45,7 +44,7 @@ static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind) if (kind) { read_lock(&cls_mod_lock); - for (t = tcf_proto_base; t; t = t->next) { + list_for_each_entry(t, &tcf_proto_base, head) { if (nla_strcmp(kind, t->kind) == 0) { if (!try_module_get(t->owner)) t = NULL; @@ -61,16 +60,15 @@ static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind) int register_tcf_proto_ops(struct tcf_proto_ops *ops) { - struct tcf_proto_ops *t, **tp; + struct tcf_proto_ops *t; int rc = -EEXIST; write_lock(&cls_mod_lock); - for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next) + list_for_each_entry(t, &tcf_proto_base, head) if (!strcmp(ops->kind, t->kind)) goto out; - ops->next = NULL; - *tp = ops; + list_add_tail(&ops->head, &tcf_proto_base); rc = 0; out: write_unlock(&cls_mod_lock); @@ -80,17 +78,17 @@ EXPORT_SYMBOL(register_tcf_proto_ops); int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) { - struct tcf_proto_ops *t, **tp; + struct tcf_proto_ops *t; int rc = -ENOENT; write_lock(&cls_mod_lock); - for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next) + list_for_each_entry(t, &tcf_proto_base, head) if (t == ops) break; if (!t) goto out; - *tp = t->next; + list_del(&t->head); rc = 0; out: write_unlock(&cls_mod_lock); @@ -500,46 +498,41 @@ out: void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - if (exts->action) { - tcf_action_destroy(exts->action, TCA_ACT_UNBIND); - exts->action = NULL; - } + tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND); + INIT_LIST_HEAD(&exts->actions); #endif } EXPORT_SYMBOL(tcf_exts_destroy); int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, - struct nlattr *rate_tlv, struct tcf_exts *exts, - const struct tcf_ext_map *map) + struct nlattr *rate_tlv, struct tcf_exts *exts) { - memset(exts, 0, sizeof(*exts)); - #ifdef CONFIG_NET_CLS_ACT { struct tc_action *act; - if 
(map->police && tb[map->police]) { - act = tcf_action_init_1(net, tb[map->police], rate_tlv, + INIT_LIST_HEAD(&exts->actions); + if (exts->police && tb[exts->police]) { + act = tcf_action_init_1(net, tb[exts->police], rate_tlv, "police", TCA_ACT_NOREPLACE, TCA_ACT_BIND); if (IS_ERR(act)) return PTR_ERR(act); - act->type = TCA_OLD_COMPAT; - exts->action = act; - } else if (map->action && tb[map->action]) { - act = tcf_action_init(net, tb[map->action], rate_tlv, + act->type = exts->type = TCA_OLD_COMPAT; + list_add(&act->list, &exts->actions); + } else if (exts->action && tb[exts->action]) { + int err; + err = tcf_action_init(net, tb[exts->action], rate_tlv, NULL, TCA_ACT_NOREPLACE, - TCA_ACT_BIND); - if (IS_ERR(act)) - return PTR_ERR(act); - - exts->action = act; + TCA_ACT_BIND, &exts->actions); + if (err) + return err; } } #else - if ((map->action && tb[map->action]) || - (map->police && tb[map->police])) + if ((exts->action && tb[exts->action]) || + (exts->police && tb[exts->police])) return -EOPNOTSUPP; #endif @@ -551,43 +544,44 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, struct tcf_exts *src) { #ifdef CONFIG_NET_CLS_ACT - if (src->action) { - struct tc_action *act; + if (!list_empty(&src->actions)) { + LIST_HEAD(tmp); tcf_tree_lock(tp); - act = dst->action; - dst->action = src->action; + list_splice_init(&dst->actions, &tmp); + list_splice(&src->actions, &dst->actions); tcf_tree_unlock(tp); - if (act) - tcf_action_destroy(act, TCA_ACT_UNBIND); + tcf_action_destroy(&tmp, TCA_ACT_UNBIND); } #endif } EXPORT_SYMBOL(tcf_exts_change); -int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, - const struct tcf_ext_map *map) +#define tcf_exts_first_act(ext) \ + list_first_entry(&(exts)->actions, struct tc_action, list) + +int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - if (map->action && exts->action) { + if (exts->action && !list_empty(&exts->actions)) { /* * again for backward compatible mode - 
we want * to work with both old and new modes of entering * tc data even if iproute2 was newer - jhs */ struct nlattr *nest; - - if (exts->action->type != TCA_OLD_COMPAT) { - nest = nla_nest_start(skb, map->action); + if (exts->type != TCA_OLD_COMPAT) { + nest = nla_nest_start(skb, exts->action); if (nest == NULL) goto nla_put_failure; - if (tcf_action_dump(skb, exts->action, 0, 0) < 0) + if (tcf_action_dump(skb, &exts->actions, 0, 0) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - } else if (map->police) { - nest = nla_nest_start(skb, map->police); + } else if (exts->police) { + struct tc_action *act = tcf_exts_first_act(exts); + nest = nla_nest_start(skb, exts->police); if (nest == NULL) goto nla_put_failure; - if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0) + if (tcf_action_dump_old(skb, act, 0, 0) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } @@ -600,17 +594,14 @@ nla_put_failure: __attribute__ ((unused)) EXPORT_SYMBOL(tcf_exts_dump); -int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts, - const struct tcf_ext_map *map) +int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - if (exts->action) - if (tcf_action_copy_stats(skb, exts->action, 1) < 0) - goto nla_put_failure; + struct tc_action *a = tcf_exts_first_act(exts); + if (tcf_action_copy_stats(skb, a, 1) < 0) + return -1; #endif return 0; -nla_put_failure: __attribute__ ((unused)) - return -1; } EXPORT_SYMBOL(tcf_exts_dump_stats); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 636d9131d870..b6552035d1f4 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -34,11 +34,6 @@ struct basic_filter { struct list_head link; }; -static const struct tcf_ext_map basic_ext_map = { - .action = TCA_BASIC_ACT, - .police = TCA_BASIC_POLICE -}; - static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { @@ -141,7 +136,8 @@ static int basic_set_parms(struct net *net, struct 
tcf_proto *tp, struct tcf_exts e; struct tcf_ematch_tree t; - err = tcf_exts_validate(net, tp, tb, est, &e, &basic_ext_map); + tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE); + err = tcf_exts_validate(net, tp, tb, est, &e); if (err < 0) return err; @@ -191,6 +187,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (f == NULL) goto errout; + tcf_exts_init(&f->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); err = -EINVAL; if (handle) f->handle = handle; @@ -263,13 +260,13 @@ static int basic_dump(struct tcf_proto *tp, unsigned long fh, nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid)) goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 || + if (tcf_exts_dump(skb, &f->exts) < 0 || tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts, &basic_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index d7c72be121f3..00a5a585e5f1 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -46,11 +46,6 @@ static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, }; -static const struct tcf_ext_map bpf_ext_map = { - .action = TCA_BPF_ACT, - .police = TCA_BPF_POLICE, -}; - static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { @@ -174,7 +169,8 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID]) return -EINVAL; - ret = tcf_exts_validate(net, tp, tb, est, &exts, &bpf_ext_map); + tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); + ret = tcf_exts_validate(net, tp, tb, est, &exts); if (ret < 0) return ret; @@ -271,6 +267,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (prog == NULL) return -ENOBUFS; + 
tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE); if (handle == 0) prog->handle = cls_bpf_grab_new_handle(tp, head); else @@ -325,12 +322,12 @@ static int cls_bpf_dump(struct tcf_proto *tp, unsigned long fh, memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); - if (tcf_exts_dump(skb, &prog->exts, &bpf_ext_map) < 0) + if (tcf_exts_dump(skb, &prog->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &prog->exts, &bpf_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &prog->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 16006c92c3fd..f9d212583ea2 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -172,11 +172,6 @@ static int cls_cgroup_init(struct tcf_proto *tp) return 0; } -static const struct tcf_ext_map cgroup_ext_map = { - .action = TCA_CGROUP_ACT, - .police = TCA_CGROUP_POLICE, -}; - static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = { [TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED }, }; @@ -203,6 +198,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (head == NULL) return -ENOBUFS; + tcf_exts_init(&head->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); head->handle = handle; tcf_tree_lock(tp); @@ -218,8 +214,8 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, - &cgroup_ext_map); + tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e); if (err < 0) return err; @@ -277,13 +273,13 @@ static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh, if (nest == NULL) goto nla_put_failure; - if (tcf_exts_dump(skb, &head->exts, &cgroup_ext_map) < 0 || + if (tcf_exts_dump(skb, &head->exts) < 0 || tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &head->exts, 
&cgroup_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &head->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 7881e2fccbc2..dfd18a5c3e81 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -56,11 +56,6 @@ struct flow_filter { u32 hashrnd; }; -static const struct tcf_ext_map flow_ext_map = { - .action = TCA_FLOW_ACT, - .police = TCA_FLOW_POLICE, -}; - static inline u32 addr_fold(void *addr) { unsigned long a = (unsigned long)addr; @@ -220,7 +215,7 @@ static u32 flow_get_vlan_tag(const struct sk_buff *skb) static u32 flow_get_rxhash(struct sk_buff *skb) { - return skb_get_rxhash(skb); + return skb_get_hash(skb); } static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow) @@ -397,7 +392,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, return -EOPNOTSUPP; } - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, &flow_ext_map); + tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e); if (err < 0) return err; @@ -455,6 +451,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, f->handle = handle; f->mask = ~0U; + tcf_exts_init(&f->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); get_random_bytes(&f->hashrnd, 4); f->perturb_timer.function = flow_perturbation; @@ -608,7 +605,7 @@ static int flow_dump(struct tcf_proto *tp, unsigned long fh, nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ)) goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0) + if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; #ifdef CONFIG_NET_EMATCH if (f->ematches.hdr.nmatches && @@ -617,7 +614,7 @@ static int flow_dump(struct tcf_proto *tp, unsigned long fh, #endif nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c 
index 9b97172db84a..3f9cece13807 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -46,11 +46,6 @@ struct fw_filter { struct tcf_exts exts; }; -static const struct tcf_ext_map fw_ext_map = { - .action = TCA_FW_ACT, - .police = TCA_FW_POLICE -}; - static inline int fw_hash(u32 handle) { if (HTSIZE == 4096) @@ -200,7 +195,8 @@ fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, u32 mask; int err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, &fw_ext_map); + tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e); if (err < 0) return err; @@ -280,6 +276,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if (f == NULL) return -ENOBUFS; + tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE); f->id = handle; err = fw_change_attrs(net, tp, f, tb, tca, base); @@ -359,12 +356,12 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh, nla_put_u32(skb, TCA_FW_MASK, head->mask)) goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0) + if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 37da567d833e..2473953a5948 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -59,11 +59,6 @@ struct route4_filter { #define ROUTE4_FAILURE ((struct route4_filter *)(-1L)) -static const struct tcf_ext_map route_ext_map = { - .police = TCA_ROUTE4_POLICE, - .action = TCA_ROUTE4_ACT -}; - static inline int route4_fastmap_hash(u32 id, int iif) { return id & 0xF; @@ -347,7 +342,8 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, struct route4_bucket *b; struct tcf_exts e; - err = tcf_exts_validate(net, tp, tb, est, &e, &route_ext_map); + tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); + err 
= tcf_exts_validate(net, tp, tb, est, &e); if (err < 0) return err; @@ -481,6 +477,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (f == NULL) goto errout; + tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); err = route4_set_parms(net, tp, base, f, handle, head, tb, tca[TCA_RATE], 1); if (err < 0) @@ -589,12 +586,12 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh, nla_put_u32(skb, TCA_ROUTE4_CLASSID, f->res.classid)) goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0) + if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 252d8b05872e..4f25c2ac825b 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -116,11 +116,6 @@ static inline unsigned int hash_src(__be32 *src) return h & 0xF; } -static struct tcf_ext_map rsvp_ext_map = { - .police = TCA_RSVP_POLICE, - .action = TCA_RSVP_ACT -}; - #define RSVP_APPLY_RESULT() \ { \ int r = tcf_exts_exec(skb, &f->exts, res); \ @@ -440,7 +435,8 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, &rsvp_ext_map); + tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e); if (err < 0) return err; @@ -471,6 +467,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, if (f == NULL) goto errout2; + tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); h2 = 16; if (tb[TCA_RSVP_SRC]) { memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src)); @@ -633,12 +630,12 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src)) goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 
0) + if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index b86535a40169..ffad18791c93 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -50,11 +50,6 @@ struct tcindex_data { int fall_through; /* 0: only classify if explicit match */ }; -static const struct tcf_ext_map tcindex_ext_map = { - .police = TCA_TCINDEX_POLICE, - .action = TCA_TCINDEX_ACT -}; - static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) { @@ -209,17 +204,21 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tcindex_filter *f = NULL; /* make gcc behave */ struct tcf_exts e; - err = tcf_exts_validate(net, tp, tb, est, &e, &tcindex_ext_map); + tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + err = tcf_exts_validate(net, tp, tb, est, &e); if (err < 0) return err; memcpy(&cp, p, sizeof(cp)); memset(&new_filter_result, 0, sizeof(new_filter_result)); + tcf_exts_init(&new_filter_result.exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (old_r) memcpy(&cr, r, sizeof(cr)); - else + else { memset(&cr, 0, sizeof(cr)); + tcf_exts_init(&cr.exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + } if (tb[TCA_TCINDEX_HASH]) cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); @@ -468,11 +467,11 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid)) goto nla_put_failure; - if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0) + if (tcf_exts_dump(skb, &r->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &r->exts) < 0) goto nla_put_failure; } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 59e546c2ac98..20f2fb79c747 100644 --- 
a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -79,11 +79,6 @@ struct tc_u_common { u32 hgenerator; }; -static const struct tcf_ext_map u32_ext_map = { - .action = TCA_U32_ACT, - .police = TCA_U32_POLICE -}; - static inline unsigned int u32_hash_fold(__be32 key, const struct tc_u32_sel *sel, u8 fshift) @@ -496,7 +491,8 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, int err; struct tcf_exts e; - err = tcf_exts_validate(net, tp, tb, est, &e, &u32_ext_map); + tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE); + err = tcf_exts_validate(net, tp, tb, est, &e); if (err < 0) return err; @@ -646,6 +642,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, n->ht_up = ht; n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; + tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); #ifdef CONFIG_CLS_U32_MARK if (tb[TCA_U32_MARK]) { @@ -759,7 +756,7 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh, goto nla_put_failure; #endif - if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0) + if (tcf_exts_dump(skb, &n->exts) < 0) goto nla_put_failure; #ifdef CONFIG_NET_CLS_IND @@ -778,7 +775,7 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh, nla_nest_end(skb, nest); if (TC_U32_KEY(n->handle)) - if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &n->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 382519a5d7f9..9b8c0b0e60d7 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -222,7 +222,7 @@ META_COLLECTOR(int_maclen) META_COLLECTOR(int_rxhash) { - dst->value = skb_get_rxhash(skb); + dst->value = skb_get_hash(skb); } /************************************************************************** diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index d5a8a4b2454f..e25183333807 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1783,8 +1783,7 @@ cbq_change_class(struct Qdisc *sch, u32 
classid, u32 parentid, struct nlattr **t qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) { - if (rtab) - qdisc_put_rtab(rtab); + qdisc_put_rtab(rtab); return err; } } diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 95d843961907..08ef7a42c0e4 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -47,6 +47,7 @@ #include <linux/rbtree.h> #include <linux/hash.h> #include <linux/prefetch.h> +#include <linux/vmalloc.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/sock.h> @@ -225,7 +226,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) /* By forcing low order bit to 1, we make sure to not * collide with a local flow (socket pointers are word aligned) */ - sk = (struct sock *)(skb_get_rxhash(skb) | 1L); + sk = (struct sock *)(skb_get_hash(skb) | 1L); } root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)]; @@ -578,15 +579,36 @@ static void fq_rehash(struct fq_sched_data *q, q->stat_gc_flows += fcnt; } -static int fq_resize(struct fq_sched_data *q, u32 log) +static void *fq_alloc_node(size_t sz, int node) { + void *ptr; + + ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node); + if (!ptr) + ptr = vmalloc_node(sz, node); + return ptr; +} + +static void fq_free(void *addr) +{ + if (addr && is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} + +static int fq_resize(struct Qdisc *sch, u32 log) +{ + struct fq_sched_data *q = qdisc_priv(sch); struct rb_root *array; u32 idx; if (q->fq_root && log == q->fq_trees_log) return 0; - array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL); + /* If XPS was setup, we can allocate memory on right NUMA node */ + array = fq_alloc_node(sizeof(struct rb_root) << log, + netdev_queue_numa_node_read(sch->dev_queue)); if (!array) return -ENOMEM; @@ -595,7 +617,7 @@ static int fq_resize(struct fq_sched_data *q, u32 log) if (q->fq_root) { fq_rehash(q, q->fq_root, q->fq_trees_log, array, log); - kfree(q->fq_root); + 
fq_free(q->fq_root); } q->fq_root = array; q->fq_trees_log = log; @@ -676,7 +698,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) } if (!err) - err = fq_resize(q, fq_log); + err = fq_resize(sch, fq_log); while (sch->q.qlen > sch->limit) { struct sk_buff *skb = fq_dequeue(sch); @@ -697,7 +719,7 @@ static void fq_destroy(struct Qdisc *sch) struct fq_sched_data *q = qdisc_priv(sch); fq_reset(sch); - kfree(q->fq_root); + fq_free(q->fq_root); qdisc_watchdog_cancel(&q->watchdog); } @@ -723,7 +745,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt) if (opt) err = fq_change(sch, opt); else - err = fq_resize(q, q->fq_trees_log); + err = fq_resize(sch, q->fq_trees_log); return err; } diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c new file mode 100644 index 000000000000..97aa33dbb90f --- /dev/null +++ b/net/sched/sch_hhf.c @@ -0,0 +1,746 @@ +/* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF) + * + * Copyright (C) 2013 Terry Lam <vtlam@google.com> + * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com> + */ + +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <net/flow_keys.h> +#include <net/pkt_sched.h> +#include <net/sock.h> + +/* Heavy-Hitter Filter (HHF) + * + * Principles : + * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter + * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified + * as heavy-hitter, it is immediately switched to the heavy-hitter bucket. + * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler, + * in which the heavy-hitter bucket is served with less weight. + * In other words, non-heavy-hitters (e.g., short bursts of critical traffic) + * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have + * higher share of bandwidth. 
+ * + * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the + * following paper: + * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and + * Accounting", in ACM SIGCOMM, 2002. + * + * Conceptually, a multi-stage filter comprises k independent hash functions + * and k counter arrays. Packets are indexed into k counter arrays by k hash + * functions, respectively. The counters are then increased by the packet sizes. + * Therefore, + * - For a heavy-hitter flow: *all* of its k array counters must be large. + * - For a non-heavy-hitter flow: some of its k array counters can be large + * due to hash collision with other small flows; however, with high + * probability, not *all* k counters are large. + * + * By the design of the multi-stage filter algorithm, the false negative rate + * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is + * susceptible to false positives (non-heavy-hitters mistakenly classified as + * heavy-hitters). + * Therefore, we also implement the following optimizations to reduce false + * positives by avoiding unnecessary increment of the counter values: + * - Optimization O1: once a heavy-hitter is identified, its bytes are not + * accounted in the array counters. This technique is called "shielding" + * in Section 3.3.1 of [EV02]. + * - Optimization O2: conservative update of counters + * (Section 3.3.2 of [EV02]), + * New counter value = max {old counter value, + * smallest counter value + packet bytes} + * + * Finally, we refresh the counters periodically since otherwise the counter + * values will keep accumulating. + * + * Once a flow is classified as heavy-hitter, we also save its per-flow state + * in an exact-matching flow table so that its subsequent packets can be + * dispatched to the heavy-hitter bucket accordingly. 
+ * + * + * At a high level, this qdisc works as follows: + * Given a packet p: + * - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching + * heavy-hitter flow table, denoted table T, then send p to the heavy-hitter + * bucket. + * - Otherwise, forward p to the multi-stage filter, denoted filter F + * + If F decides that p belongs to a non-heavy-hitter flow, then send p + * to the non-heavy-hitter bucket. + * + Otherwise, if F decides that p belongs to a new heavy-hitter flow, + * then set up a new flow entry for the flow-id of p in the table T and + * send p to the heavy-hitter bucket. + * + * In this implementation: + * - T is a fixed-size hash-table with 1024 entries. Hash collision is + * resolved by linked-list chaining. + * - F has four counter arrays, each array containing 1024 32-bit counters. + * That means 4 * 1024 * 32 bits = 16KB of memory. + * - Since each array in F contains 1024 counters, 10 bits are sufficient to + * index into each array. + * Hence, instead of having four hash functions, we chop the 32-bit + * skb-hash into three 10-bit chunks, and the fourth 10-bit index is + * computed as the XOR sum of those three chunks and the 2 leftover hash bits. + * - We need to clear the counter arrays periodically; however, directly + * memsetting 16KB of memory can lead to cache eviction and unwanted delay. + * So by shadowing each counter with a valid bit, we only need to reset + * 4K valid bits (i.e. 512 bytes) instead of 16KB of memory. + * - The Deficit Round Robin engine is taken from the fq_codel implementation + * (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to + * fq_codel_flow in the fq_codel implementation.
+ * + */ + +/* Non-configurable parameters */ +#define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */ +#define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */ +#define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */ +#define HHF_BIT_MASK_LEN 10 /* masking 10 bits */ +#define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */ + +#define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */ +enum wdrr_bucket_idx { + WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */ + WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */ +}; + +#define hhf_time_before(a, b) \ + (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0)) + +/* Heavy-hitter per-flow state */ +struct hh_flow_state { + u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */ + u32 hit_timestamp; /* last time heavy-hitter was seen */ + struct list_head flowchain; /* chaining under hash collision */ +}; + +/* Weighted Deficit Round Robin (WDRR) scheduler */ +struct wdrr_bucket { + struct sk_buff *head; + struct sk_buff *tail; + struct list_head bucketchain; + int deficit; +}; + +struct hhf_sched_data { + struct wdrr_bucket buckets[WDRR_BUCKET_CNT]; + u32 perturbation; /* hash perturbation */ + u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ + u32 drop_overlimit; /* number of times max qdisc packet + * limit was hit + */ + struct list_head *hh_flows; /* table T (currently active HHs) */ + u32 hh_flows_limit; /* max active HH allocs */ + u32 hh_flows_overlimit; /* num of disallowed HH allocs */ + u32 hh_flows_total_cnt; /* total admitted HHs */ + u32 hh_flows_current_cnt; /* total current HHs */ + u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */ + u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays + * was reset + */ + unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits + * of hhf_arrays + */ + /* Similar to the "new_flows" vs. 
"old_flows" concept in fq_codel DRR */ + struct list_head new_buckets; /* list of new buckets */ + struct list_head old_buckets; /* list of old buckets */ + + /* Configurable HHF parameters */ + u32 hhf_reset_timeout; /* interval to reset counter + * arrays in filter F + * (default 40ms) + */ + u32 hhf_admit_bytes; /* counter thresh to classify as + * HH (default 128KB). + * With these default values, + * 128KB / 40ms = 25 Mbps + * i.e., we expect to capture HHs + * sending > 25 Mbps. + */ + u32 hhf_evict_timeout; /* aging threshold to evict idle + * HHs out of table T. This should + * be large enough to avoid + * reordering during HH eviction. + * (default 1s) + */ + u32 hhf_non_hh_weight; /* WDRR weight for non-HHs + * (default 2, + * i.e., non-HH : HH = 2 : 1) + */ +}; + +static u32 hhf_time_stamp(void) +{ + return jiffies; +} + +static unsigned int skb_hash(const struct hhf_sched_data *q, + const struct sk_buff *skb) +{ + struct flow_keys keys; + unsigned int hash; + + if (skb->sk && skb->sk->sk_hash) + return skb->sk->sk_hash; + + skb_flow_dissect(skb, &keys); + hash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src ^ keys.ip_proto, + (__force u32)keys.ports, q->perturbation); + return hash; +} + +/* Looks up a heavy-hitter flow in a chaining list of table T. */ +static struct hh_flow_state *seek_list(const u32 hash, + struct list_head *head, + struct hhf_sched_data *q) +{ + struct hh_flow_state *flow, *next; + u32 now = hhf_time_stamp(); + + if (list_empty(head)) + return NULL; + + list_for_each_entry_safe(flow, next, head, flowchain) { + u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; + + if (hhf_time_before(prev, now)) { + /* Delete expired heavy-hitters, but preserve one entry + * to avoid kzalloc() when next time this slot is hit. 
+ */ + if (list_is_last(&flow->flowchain, head)) + return NULL; + list_del(&flow->flowchain); + kfree(flow); + q->hh_flows_current_cnt--; + } else if (flow->hash_id == hash) { + return flow; + } + } + return NULL; +} + +/* Returns a flow state entry for a new heavy-hitter. Either reuses an expired + * entry or dynamically alloc a new entry. + */ +static struct hh_flow_state *alloc_new_hh(struct list_head *head, + struct hhf_sched_data *q) +{ + struct hh_flow_state *flow; + u32 now = hhf_time_stamp(); + + if (!list_empty(head)) { + /* Find an expired heavy-hitter flow entry. */ + list_for_each_entry(flow, head, flowchain) { + u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; + + if (hhf_time_before(prev, now)) + return flow; + } + } + + if (q->hh_flows_current_cnt >= q->hh_flows_limit) { + q->hh_flows_overlimit++; + return NULL; + } + /* Create new entry. */ + flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC); + if (!flow) + return NULL; + + q->hh_flows_current_cnt++; + INIT_LIST_HEAD(&flow->flowchain); + list_add_tail(&flow->flowchain, head); + + return flow; +} + +/* Assigns packets to WDRR buckets. Implements a multi-stage filter to + * classify heavy-hitters. + */ +static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + u32 tmp_hash, hash; + u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos; + struct hh_flow_state *flow; + u32 pkt_len, min_hhf_val; + int i; + u32 prev; + u32 now = hhf_time_stamp(); + + /* Reset the HHF counter arrays if this is the right time. */ + prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout; + if (hhf_time_before(prev, now)) { + for (i = 0; i < HHF_ARRAYS_CNT; i++) + bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN); + q->hhf_arrays_reset_timestamp = now; + } + + /* Get hashed flow-id of the skb. */ + hash = skb_hash(q, skb); + + /* Check if this packet belongs to an already established HH flow. 
*/ + flow_pos = hash & HHF_BIT_MASK; + flow = seek_list(hash, &q->hh_flows[flow_pos], q); + if (flow) { /* found its HH flow */ + flow->hit_timestamp = now; + return WDRR_BUCKET_FOR_HH; + } + + /* Now pass the packet through the multi-stage filter. */ + tmp_hash = hash; + xorsum = 0; + for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) { + /* Split the skb_hash into three 10-bit chunks. */ + filter_pos[i] = tmp_hash & HHF_BIT_MASK; + xorsum ^= filter_pos[i]; + tmp_hash >>= HHF_BIT_MASK_LEN; + } + /* The last chunk is computed as XOR sum of other chunks. */ + filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash; + + pkt_len = qdisc_pkt_len(skb); + min_hhf_val = ~0U; + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + u32 val; + + if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) { + q->hhf_arrays[i][filter_pos[i]] = 0; + __set_bit(filter_pos[i], q->hhf_valid_bits[i]); + } + + val = q->hhf_arrays[i][filter_pos[i]] + pkt_len; + if (min_hhf_val > val) + min_hhf_val = val; + } + + /* Found a new HH iff all counter values > HH admit threshold. */ + if (min_hhf_val > q->hhf_admit_bytes) { + /* Just captured a new heavy-hitter. */ + flow = alloc_new_hh(&q->hh_flows[flow_pos], q); + if (!flow) /* memory alloc problem */ + return WDRR_BUCKET_FOR_NON_HH; + flow->hash_id = hash; + flow->hit_timestamp = now; + q->hh_flows_total_cnt++; + + /* By returning without updating counters in q->hhf_arrays, + * we implicitly implement "shielding" (see Optimization O1). + */ + return WDRR_BUCKET_FOR_HH; + } + + /* Conservative update of HHF arrays (see Optimization O2). */ + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val) + q->hhf_arrays[i][filter_pos[i]] = min_hhf_val; + } + return WDRR_BUCKET_FOR_NON_HH; +} + +/* Removes one skb from head of bucket. */ +static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket) +{ + struct sk_buff *skb = bucket->head; + + bucket->head = skb->next; + skb->next = NULL; + return skb; +} + +/* Tail-adds skb to bucket. 
*/ +static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) +{ + if (bucket->head == NULL) + bucket->head = skb; + else + bucket->tail->next = skb; + bucket->tail = skb; + skb->next = NULL; +} + +static unsigned int hhf_drop(struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct wdrr_bucket *bucket; + + /* Always try to drop from heavy-hitters first. */ + bucket = &q->buckets[WDRR_BUCKET_FOR_HH]; + if (!bucket->head) + bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH]; + + if (bucket->head) { + struct sk_buff *skb = dequeue_head(bucket); + + sch->q.qlen--; + sch->qstats.drops++; + sch->qstats.backlog -= qdisc_pkt_len(skb); + kfree_skb(skb); + } + + /* Return id of the bucket from which the packet was dropped. */ + return bucket - q->buckets; +} + +static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + enum wdrr_bucket_idx idx; + struct wdrr_bucket *bucket; + + idx = hhf_classify(skb, sch); + + bucket = &q->buckets[idx]; + bucket_add(bucket, skb); + sch->qstats.backlog += qdisc_pkt_len(skb); + + if (list_empty(&bucket->bucketchain)) { + unsigned int weight; + + /* The logic of new_buckets vs. old_buckets is the same as + * new_flows vs. old_flows in the implementation of fq_codel, + * i.e., short bursts of non-HHs should have strict priority. + */ + if (idx == WDRR_BUCKET_FOR_HH) { + /* Always move heavy-hitters to old bucket. */ + weight = 1; + list_add_tail(&bucket->bucketchain, &q->old_buckets); + } else { + weight = q->hhf_non_hh_weight; + list_add_tail(&bucket->bucketchain, &q->new_buckets); + } + bucket->deficit = weight * q->quantum; + } + if (++sch->q.qlen < sch->limit) + return NET_XMIT_SUCCESS; + + q->drop_overlimit++; + /* Return Congestion Notification only if we dropped a packet from this + * bucket. + */ + if (hhf_drop(sch) == idx) + return NET_XMIT_CN; + + /* As we dropped a packet, better let upper stack know this. 
*/ + qdisc_tree_decrease_qlen(sch, 1); + return NET_XMIT_SUCCESS; +} + +static struct sk_buff *hhf_dequeue(struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = NULL; + struct wdrr_bucket *bucket; + struct list_head *head; + +begin: + head = &q->new_buckets; + if (list_empty(head)) { + head = &q->old_buckets; + if (list_empty(head)) + return NULL; + } + bucket = list_first_entry(head, struct wdrr_bucket, bucketchain); + + if (bucket->deficit <= 0) { + int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ? + 1 : q->hhf_non_hh_weight; + + bucket->deficit += weight * q->quantum; + list_move_tail(&bucket->bucketchain, &q->old_buckets); + goto begin; + } + + if (bucket->head) { + skb = dequeue_head(bucket); + sch->q.qlen--; + sch->qstats.backlog -= qdisc_pkt_len(skb); + } + + if (!skb) { + /* Force a pass through old_buckets to prevent starvation. */ + if ((head == &q->new_buckets) && !list_empty(&q->old_buckets)) + list_move_tail(&bucket->bucketchain, &q->old_buckets); + else + list_del_init(&bucket->bucketchain); + goto begin; + } + qdisc_bstats_update(sch, skb); + bucket->deficit -= qdisc_pkt_len(skb); + + return skb; +} + +static void hhf_reset(struct Qdisc *sch) +{ + struct sk_buff *skb; + + while ((skb = hhf_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static void *hhf_zalloc(size_t sz) +{ + void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); + + if (!ptr) + ptr = vzalloc(sz); + + return ptr; +} + +static void hhf_free(void *addr) +{ + if (addr) { + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); + } +} + +static void hhf_destroy(struct Qdisc *sch) +{ + int i; + struct hhf_sched_data *q = qdisc_priv(sch); + + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + hhf_free(q->hhf_arrays[i]); + hhf_free(q->hhf_valid_bits[i]); + } + + for (i = 0; i < HH_FLOWS_CNT; i++) { + struct hh_flow_state *flow, *next; + struct list_head *head = &q->hh_flows[i]; + + if (list_empty(head)) + continue; + list_for_each_entry_safe(flow, 
next, head, flowchain) { + list_del(&flow->flowchain); + kfree(flow); + } + } + hhf_free(q->hh_flows); +} + +static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { + [TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 }, + [TCA_HHF_QUANTUM] = { .type = NLA_U32 }, + [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 }, + [TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 }, + [TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 }, + [TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 }, + [TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 }, +}; + +static int hhf_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_HHF_MAX + 1]; + unsigned int qlen; + int err; + u64 non_hh_quantum; + u32 new_quantum = q->quantum; + u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + + if (tb[TCA_HHF_BACKLOG_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]); + + if (tb[TCA_HHF_QUANTUM]) + new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]); + + if (tb[TCA_HHF_NON_HH_WEIGHT]) + new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]); + + non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight; + if (non_hh_quantum > INT_MAX) + return -EINVAL; + q->quantum = new_quantum; + q->hhf_non_hh_weight = new_hhf_non_hh_weight; + + if (tb[TCA_HHF_HH_FLOWS_LIMIT]) + q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]); + + if (tb[TCA_HHF_RESET_TIMEOUT]) { + u32 ms = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]); + + q->hhf_reset_timeout = msecs_to_jiffies(ms); + } + + if (tb[TCA_HHF_ADMIT_BYTES]) + q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]); + + if (tb[TCA_HHF_EVICT_TIMEOUT]) { + u32 ms = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]); + + q->hhf_evict_timeout = msecs_to_jiffies(ms); + } + + qlen = sch->q.qlen; + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = hhf_dequeue(sch); + + 
kfree_skb(skb); + } + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + + sch_tree_unlock(sch); + return 0; +} + +static int hhf_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + int i; + + sch->limit = 1000; + q->quantum = psched_mtu(qdisc_dev(sch)); + q->perturbation = net_random(); + INIT_LIST_HEAD(&q->new_buckets); + INIT_LIST_HEAD(&q->old_buckets); + + /* Configurable HHF parameters */ + q->hhf_reset_timeout = HZ / 25; /* 40 ms */ + q->hhf_admit_bytes = 131072; /* 128 KB */ + q->hhf_evict_timeout = HZ; /* 1 sec */ + q->hhf_non_hh_weight = 2; + + if (opt) { + int err = hhf_change(sch, opt); + + if (err) + return err; + } + + if (!q->hh_flows) { + /* Initialize heavy-hitter flow table. */ + q->hh_flows = hhf_zalloc(HH_FLOWS_CNT * + sizeof(struct list_head)); + if (!q->hh_flows) + return -ENOMEM; + for (i = 0; i < HH_FLOWS_CNT; i++) + INIT_LIST_HEAD(&q->hh_flows[i]); + + /* Cap max active HHs at twice len of hh_flows table. */ + q->hh_flows_limit = 2 * HH_FLOWS_CNT; + q->hh_flows_overlimit = 0; + q->hh_flows_total_cnt = 0; + q->hh_flows_current_cnt = 0; + + /* Initialize heavy-hitter filter arrays. */ + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN * + sizeof(u32)); + if (!q->hhf_arrays[i]) { + hhf_destroy(sch); + return -ENOMEM; + } + } + q->hhf_arrays_reset_timestamp = hhf_time_stamp(); + + /* Initialize valid bits of heavy-hitter filter arrays. */ + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN / + BITS_PER_BYTE); + if (!q->hhf_valid_bits[i]) { + hhf_destroy(sch); + return -ENOMEM; + } + } + + /* Initialize Weighted DRR buckets. 
*/ + for (i = 0; i < WDRR_BUCKET_CNT; i++) { + struct wdrr_bucket *bucket = q->buckets + i; + + INIT_LIST_HEAD(&bucket->bucketchain); + } + } + + return 0; +} + +static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) || + nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) || + nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT, + jiffies_to_msecs(q->hhf_reset_timeout)) || + nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) || + nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT, + jiffies_to_msecs(q->hhf_evict_timeout)) || + nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight)) + goto nla_put_failure; + + nla_nest_end(skb, opts); + return skb->len; + +nla_put_failure: + return -1; +} + +static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct tc_hhf_xstats st = { + .drop_overlimit = q->drop_overlimit, + .hh_overlimit = q->hh_flows_overlimit, + .hh_tot_count = q->hh_flows_total_cnt, + .hh_cur_count = q->hh_flows_current_cnt, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +struct Qdisc_ops hhf_qdisc_ops __read_mostly = { + .id = "hhf", + .priv_size = sizeof(struct hhf_sched_data), + + .enqueue = hhf_enqueue, + .dequeue = hhf_dequeue, + .peek = qdisc_peek_dequeued, + .drop = hhf_drop, + .init = hhf_init, + .reset = hhf_reset, + .destroy = hhf_destroy, + .change = hhf_change, + .dump = hhf_dump, + .dump_stats = hhf_dump_stats, + .owner = THIS_MODULE, +}; +EXPORT_SYMBOL(hhf_qdisc_ops); + +static int __init hhf_module_init(void) +{ + return register_qdisc(&hhf_qdisc_ops); +} + +static void __exit hhf_module_exit(void) +{ + unregister_qdisc(&hhf_qdisc_ops); +} + +module_init(hhf_module_init) 
+module_exit(hhf_module_exit) +MODULE_AUTHOR("Terry Lam"); +MODULE_AUTHOR("Nandita Dukkipati"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 830c64f25539..6b0e854b0115 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1471,11 +1471,22 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, sch_tree_lock(sch); } + rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0; + + ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0; + + psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64); + psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64); + /* it used to be a nasty bug here, we have to check that node * is really leaf before changing cl->un.leaf ! */ if (!cl->level) { - cl->quantum = hopt->rate.rate / q->rate2quantum; + u64 quantum = cl->rate.rate_bytes_ps; + + do_div(quantum, q->rate2quantum); + cl->quantum = min_t(u64, quantum, INT_MAX); + if (!hopt->quantum && cl->quantum < 1000) { pr_warning( "HTB: quantum of class %X is small. Consider r2q change.\n", @@ -1494,13 +1505,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->prio = TC_HTB_NUMPRIO - 1; } - rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0; - - ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0; - - psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64); - psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64); - cl->buffer = PSCHED_TICKS2NS(hopt->buffer); cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index a6090051c5db..887e672f9d7d 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -118,6 +118,32 @@ struct tbf_sched_data { }; +/* Time to Length, convert time in ns to length in bytes + * to determinate how many bytes can be sent in given time. 
+ */ +static u64 psched_ns_t2l(const struct psched_ratecfg *r, + u64 time_in_ns) +{ + /* The formula is : + * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC + */ + u64 len = time_in_ns * r->rate_bytes_ps; + + do_div(len, NSEC_PER_SEC); + + if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) { + do_div(len, 53); + len = len * 48; + } + + if (len > r->overhead) + len -= r->overhead; + else + len = 0; + + return len; +} + /* * Return length of individual segments of a gso packet, * including all headers (MAC, IP, TCP/UDP) @@ -289,10 +315,11 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) struct tbf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_TBF_MAX + 1]; struct tc_tbf_qopt *qopt; - struct qdisc_rate_table *rtab = NULL; - struct qdisc_rate_table *ptab = NULL; struct Qdisc *child = NULL; - int max_size, n; + struct psched_ratecfg rate; + struct psched_ratecfg peak; + u64 max_size; + s64 buffer, mtu; u64 rate64 = 0, prate64 = 0; err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy); @@ -304,38 +331,13 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) goto done; qopt = nla_data(tb[TCA_TBF_PARMS]); - rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]); - if (rtab == NULL) - goto done; - - if (qopt->peakrate.rate) { - if (qopt->peakrate.rate > qopt->rate.rate) - ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]); - if (ptab == NULL) - goto done; - } - - for (n = 0; n < 256; n++) - if (rtab->data[n] > qopt->buffer) - break; - max_size = (n << qopt->rate.cell_log) - 1; - if (ptab) { - int size; - - for (n = 0; n < 256; n++) - if (ptab->data[n] > qopt->mtu) - break; - size = (n << qopt->peakrate.cell_log) - 1; - if (size < max_size) - max_size = size; - } - if (max_size < 0) - goto done; + if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&qopt->rate, + tb[TCA_TBF_RTAB])); - if (max_size < psched_mtu(qdisc_dev(sch))) - pr_warn_ratelimited("sch_tbf: burst %u is lower than device %s mtu (%u) 
!\n", - max_size, qdisc_dev(sch)->name, - psched_mtu(qdisc_dev(sch))); + if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate, + tb[TCA_TBF_PTAB])); if (q->qdisc != &noop_qdisc) { err = fifo_set_limit(q->qdisc, qopt->limit); @@ -349,6 +351,39 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) } } + buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U); + mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U); + + if (tb[TCA_TBF_RATE64]) + rate64 = nla_get_u64(tb[TCA_TBF_RATE64]); + psched_ratecfg_precompute(&rate, &qopt->rate, rate64); + + max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U); + + if (qopt->peakrate.rate) { + if (tb[TCA_TBF_PRATE64]) + prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]); + psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64); + if (peak.rate_bytes_ps <= rate.rate_bytes_ps) { + pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n", + peak.rate_bytes_ps, rate.rate_bytes_ps); + err = -EINVAL; + goto done; + } + + max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu)); + } + + if (max_size < psched_mtu(qdisc_dev(sch))) + pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n", + max_size, qdisc_dev(sch)->name, + psched_mtu(qdisc_dev(sch))); + + if (!max_size) { + err = -EINVAL; + goto done; + } + sch_tree_lock(sch); if (child) { qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); @@ -362,13 +397,9 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) q->tokens = q->buffer; q->ptokens = q->mtu; - if (tb[TCA_TBF_RATE64]) - rate64 = nla_get_u64(tb[TCA_TBF_RATE64]); - psched_ratecfg_precompute(&q->rate, &rtab->rate, rate64); - if (ptab) { - if (tb[TCA_TBF_PRATE64]) - prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]); - psched_ratecfg_precompute(&q->peak, &ptab->rate, prate64); + memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg)); + if (qopt->peakrate.rate) { + memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg)); 
q->peak_present = true; } else { q->peak_present = false; @@ -377,10 +408,6 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) sch_tree_unlock(sch); err = 0; done: - if (rtab) - qdisc_put_rtab(rtab); - if (ptab) - qdisc_put_rtab(ptab); return err; } diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 3d7c6bd46311..5ae609200674 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -145,8 +145,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a = 5 * asoc->rto_max; asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; - asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = - min_t(unsigned long, sp->autoclose, net->sctp.max_autoclose) * HZ; + asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; /* Initializes the timers */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) @@ -254,8 +253,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a asoc->peer.ipv6_address = 1; INIT_LIST_HEAD(&asoc->asocs); - asoc->autoclose = sp->autoclose; - asoc->default_stream = sp->default_stream; asoc->default_ppid = sp->default_ppid; asoc->default_flags = sp->default_flags; diff --git a/net/sctp/input.c b/net/sctp/input.c index 2a192a7c5d81..042ec6c9ae24 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -389,6 +389,9 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, if (!t || (t->pathmtu <= pmtu)) return; + if (!ip6_sk_accept_pmtu(sk)) + return; + if (sock_owned_by_user(sk)) { asoc->pmtu_pending = 1; t->pmtu_pending = 1; diff --git a/net/sctp/output.c b/net/sctp/output.c index 6371337e1fe7..3f55823279d9 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -580,7 +580,8 @@ int sctp_packet_transmit(struct sctp_packet *packet) unsigned long timeout; /* Restart the AUTOCLOSE timer when sending data. 
*/ - if (sctp_state(asoc, ESTABLISHED) && asoc->autoclose) { + if (sctp_state(asoc, ESTABLISHED) && + asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) { timer = &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; timeout = asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; diff --git a/net/sctp/probe.c b/net/sctp/probe.c index 53c452efb40b..5e68b94ee640 100644 --- a/net/sctp/probe.c +++ b/net/sctp/probe.c @@ -38,6 +38,7 @@ #include <net/sctp/sctp.h> #include <net/sctp/sm.h> +MODULE_SOFTDEP("pre: sctp"); MODULE_AUTHOR("Wei Yongjun <yjwei@cn.fujitsu.com>"); MODULE_DESCRIPTION("SCTP snooper"); MODULE_LICENSE("GPL"); @@ -182,6 +183,20 @@ static struct jprobe sctp_recv_probe = { .entry = jsctp_sf_eat_sack, }; +static __init int sctp_setup_jprobe(void) +{ + int ret = register_jprobe(&sctp_recv_probe); + + if (ret) { + if (request_module("sctp")) + goto out; + ret = register_jprobe(&sctp_recv_probe); + } + +out: + return ret; +} + static __init int sctpprobe_init(void) { int ret = -ENOMEM; @@ -202,7 +217,7 @@ static __init int sctpprobe_init(void) &sctpprobe_fops)) goto free_kfifo; - ret = register_jprobe(&sctp_recv_probe); + ret = sctp_setup_jprobe(); if (ret) goto remove_proc; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index dd0eba919a8b..ee02771d8b9c 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -819,7 +819,7 @@ sctp_disposition_t sctp_sf_do_5_1D_ce(struct net *net, SCTP_INC_STATS(net, SCTP_MIB_PASSIVEESTABS); sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL()); - if (new_asoc->autoclose) + if (new_asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); @@ -907,7 +907,7 @@ sctp_disposition_t sctp_sf_do_5_1E_ca(struct net *net, SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB); SCTP_INC_STATS(net, SCTP_MIB_ACTIVEESTABS); sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL()); - if (asoc->autoclose) + if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) 
sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); @@ -2969,7 +2969,7 @@ sctp_disposition_t sctp_sf_eat_data_6_2(struct net *net, if (chunk->chunk_hdr->flags & SCTP_DATA_SACK_IMM) force = SCTP_FORCE(); - if (asoc->autoclose) { + if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) { sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); } @@ -3877,7 +3877,7 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(struct net *net, SCTP_CHUNK(chunk)); /* Count this as receiving DATA. */ - if (asoc->autoclose) { + if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) { sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); } @@ -5266,7 +5266,7 @@ sctp_disposition_t sctp_sf_do_9_2_start_shutdown( sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); - if (asoc->autoclose) + if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); @@ -5345,7 +5345,7 @@ sctp_disposition_t sctp_sf_do_9_2_shutdown_ack( sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); - if (asoc->autoclose) + if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 191cd9257806..d39fd0c2c4cf 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2195,6 +2195,7 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); + struct net *net = sock_net(sk); /* Applicable to UDP-style socket only */ if (sctp_style(sk, TCP)) @@ -2204,6 +2205,9 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, if (copy_from_user(&sp->autoclose, optval, optlen)) return -EFAULT; + if (sp->autoclose > net->sctp.max_autoclose) + sp->autoclose = 
net->sctp.max_autoclose; + return 0; } @@ -2810,6 +2814,8 @@ static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, unsigne { struct sctp_rtoinfo rtoinfo; struct sctp_association *asoc; + unsigned long rto_min, rto_max; + struct sctp_sock *sp = sctp_sk(sk); if (optlen != sizeof (struct sctp_rtoinfo)) return -EINVAL; @@ -2823,26 +2829,36 @@ static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, unsigne if (!asoc && rtoinfo.srto_assoc_id && sctp_style(sk, UDP)) return -EINVAL; + rto_max = rtoinfo.srto_max; + rto_min = rtoinfo.srto_min; + + if (rto_max) + rto_max = asoc ? msecs_to_jiffies(rto_max) : rto_max; + else + rto_max = asoc ? asoc->rto_max : sp->rtoinfo.srto_max; + + if (rto_min) + rto_min = asoc ? msecs_to_jiffies(rto_min) : rto_min; + else + rto_min = asoc ? asoc->rto_min : sp->rtoinfo.srto_min; + + if (rto_min > rto_max) + return -EINVAL; + if (asoc) { if (rtoinfo.srto_initial != 0) asoc->rto_initial = msecs_to_jiffies(rtoinfo.srto_initial); - if (rtoinfo.srto_max != 0) - asoc->rto_max = msecs_to_jiffies(rtoinfo.srto_max); - if (rtoinfo.srto_min != 0) - asoc->rto_min = msecs_to_jiffies(rtoinfo.srto_min); + asoc->rto_max = rto_max; + asoc->rto_min = rto_min; } else { /* If there is no association or the association-id = 0 * set the values to the endpoint. 
*/ - struct sctp_sock *sp = sctp_sk(sk); - if (rtoinfo.srto_initial != 0) sp->rtoinfo.srto_initial = rtoinfo.srto_initial; - if (rtoinfo.srto_max != 0) - sp->rtoinfo.srto_max = rtoinfo.srto_max; - if (rtoinfo.srto_min != 0) - sp->rtoinfo.srto_min = rtoinfo.srto_min; + sp->rtoinfo.srto_max = rto_max; + sp->rtoinfo.srto_min = rto_min; } return 0; diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 80b17b5df6bb..9dd5ac084663 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -55,11 +55,16 @@ extern long sysctl_sctp_mem[3]; extern int sysctl_sctp_rmem[3]; extern int sysctl_sctp_wmem[3]; -static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, - int write, +static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, - loff_t *ppos); + static struct ctl_table sctp_table[] = { { .procname = "sctp_mem", @@ -101,17 +106,17 @@ static struct ctl_table sctp_net_table[] = { .data = &init_net.sctp.rto_min, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_sctp_do_rto_min, .extra1 = &one, - .extra2 = &timer_max + .extra2 = &init_net.sctp.rto_max }, { .procname = "rto_max", .data = &init_net.sctp.rto_max, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .proc_handler = proc_sctp_do_rto_max, + .extra1 = &init_net.sctp.rto_min, .extra2 = &timer_max }, { @@ -293,8 +298,7 @@ static struct ctl_table sctp_net_table[] = { { /* sentinel */ } }; -static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, - int write, +static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -341,6 +345,60 @@ static int 
proc_sctp_do_hmac_alg(struct ctl_table *ctl, return ret; } +static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net = current->nsproxy->net_ns; + int new_value; + struct ctl_table tbl; + unsigned int min = *(unsigned int *) ctl->extra1; + unsigned int max = *(unsigned int *) ctl->extra2; + int ret; + + memset(&tbl, 0, sizeof(struct ctl_table)); + tbl.maxlen = sizeof(unsigned int); + + if (write) + tbl.data = &new_value; + else + tbl.data = &net->sctp.rto_min; + ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); + if (write) { + if (ret || new_value > max || new_value < min) + return -EINVAL; + net->sctp.rto_min = new_value; + } + return ret; +} + +static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net = current->nsproxy->net_ns; + int new_value; + struct ctl_table tbl; + unsigned int min = *(unsigned int *) ctl->extra1; + unsigned int max = *(unsigned int *) ctl->extra2; + int ret; + + memset(&tbl, 0, sizeof(struct ctl_table)); + tbl.maxlen = sizeof(unsigned int); + + if (write) + tbl.data = &new_value; + else + tbl.data = &net->sctp.rto_max; + ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); + if (write) { + if (ret || new_value > max || new_value < min) + return -EINVAL; + net->sctp.rto_max = new_value; + } + return ret; +} + int sctp_sysctl_net_register(struct net *net) { struct ctl_table *table; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 97912b40c254..42fdfc634e56 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1517,7 +1517,7 @@ out: static int gss_refresh_null(struct rpc_task *task) { - return -EACCES; + return 0; } static __be32 * diff --git a/net/tipc/core.c b/net/tipc/core.c index 68977c423022..f9e88d8b04ca 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -109,7 +109,6 @@ err: static void 
tipc_core_stop(void) { tipc_netlink_stop(); - tipc_handler_stop(); tipc_cfg_stop(); tipc_subscr_stop(); tipc_nametbl_stop(); @@ -142,9 +141,10 @@ static int tipc_core_start(void) res = tipc_subscr_start(); if (!res) res = tipc_cfg_init(); - if (res) + if (res) { + tipc_handler_stop(); tipc_core_stop(); - + } return res; } @@ -174,6 +174,7 @@ static int __init tipc_init(void) static void __exit tipc_exit(void) { + tipc_handler_stop(); tipc_core_stop_net(); tipc_core_stop(); pr_info("Deactivated\n"); diff --git a/net/tipc/handler.c b/net/tipc/handler.c index b36f0fcd9bdf..e4bc8a296744 100644 --- a/net/tipc/handler.c +++ b/net/tipc/handler.c @@ -56,12 +56,13 @@ unsigned int tipc_k_signal(Handler routine, unsigned long argument) { struct queue_item *item; + spin_lock_bh(&qitem_lock); if (!handler_enabled) { pr_err("Signal request ignored by handler\n"); + spin_unlock_bh(&qitem_lock); return -ENOPROTOOPT; } - spin_lock_bh(&qitem_lock); item = kmem_cache_alloc(tipc_queue_item_cache, GFP_ATOMIC); if (!item) { pr_err("Signal queue out of memory\n"); @@ -112,10 +113,14 @@ void tipc_handler_stop(void) struct list_head *l, *n; struct queue_item *item; - if (!handler_enabled) + spin_lock_bh(&qitem_lock); + if (!handler_enabled) { + spin_unlock_bh(&qitem_lock); return; - + } handler_enabled = 0; + spin_unlock_bh(&qitem_lock); + tasklet_kill(&tipc_tasklet); spin_lock_bh(&qitem_lock); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 813622296317..800ca61758ff 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -532,13 +532,17 @@ static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *, static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *, struct msghdr *, size_t, int); -static void unix_set_peek_off(struct sock *sk, int val) +static int unix_set_peek_off(struct sock *sk, int val) { struct unix_sock *u = unix_sk(sk); - mutex_lock(&u->readlock); + if (mutex_lock_interruptible(&u->readlock)) + return -EINTR; + sk->sk_peek_off = val; 
mutex_unlock(&u->readlock); + + return 0; } @@ -716,7 +720,9 @@ static int unix_autobind(struct socket *sock) int err; unsigned int retries = 0; - mutex_lock(&u->readlock); + err = mutex_lock_interruptible(&u->readlock); + if (err) + return err; err = 0; if (u->addr) @@ -875,7 +881,9 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; addr_len = err; - mutex_lock(&u->readlock); + err = mutex_lock_interruptible(&u->readlock); + if (err) + goto out; err = -EINVAL; if (u->addr) |